This organization hosts models that predict seven personal qualities from student admissions essays. Each model is a text classifier that takes text as input and outputs probabilities for each personal quality.
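
For a quick single-text check, the standard transformers pipeline API also works (a minimal sketch: the model name matches the full example below, and the essay snippet is hypothetical):

from transformers import pipeline

# load one of the quality classifiers; top_k=None returns a score for every class
classifier = pipeline("text-classification", model="pqadmissions/perseverance_1", top_k=None)

# hypothetical essay snippet for illustration
print(classifier("I kept refining my robotics project after the first two prototypes failed."))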

Here is example code for running these models, adapted from the example code for the excellent j-hartmann/emotion-english-distilroberta-base model. It runs a loop in which batch_size examples are passed to the model and the results are iteratively appended to a file. This is to accommodate potential system crashes: it is better to save progress periodically than to risk losing it all.

If you do experience a crash, make sure to avoid overwriting the results file when running the script again (see the comment at the "initialize results file" step).

# import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import time

# User Input
model_name = "pqadmissions/perseverance_1"  # specify the name of the model
file_name = "input.csv"  # specify the name of the input file (needs an id column and a text column)
output_file = "output.csv"  # specify the name of the output file
id_column = "id" # select the column in your csv that contains the unique id for each text
text_column = "text"  # select the column in your csv that contains the text to be classified
batch_size = 10_000  # select the batch size before each save

# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}
      
# load tokenizer and model, create trainer
print("downloading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

# read in csv
print("reading in csv...")
df_pred = pd.read_csv(file_name)
# drop rows with missing text up front so their ids are not retried forever
df_pred = df_pred.dropna(subset=[text_column])
all_ids = df_pred[id_column].tolist()

# initialize results file
# NOTE: this overwrites any existing output. If you are resuming after a crash,
# comment out the next line so previously saved predictions are kept.
pd.DataFrame(columns=[id_column, text_column, 'prediction', 'score']).to_csv(output_file, index=False)

results = pd.read_csv(output_file)
ids_ran = results[id_column].tolist()
ids_to_run = np.setdiff1d(all_ids, ids_ran).tolist()

while len(ids_to_run) > 0:
    start_time = time.time()
    print("preparing data...")
    # Make a subset of csv based on results so far
    results = pd.read_csv(output_file)
    ids_ran = results[id_column].tolist()
    ids_to_run = np.setdiff1d(all_ids, ids_ran).tolist()
    if len(ids_to_run) < batch_size:
        batch_size = len(ids_to_run)
    pred_sample = df_pred[df_pred[id_column].isin(ids_to_run)].sample(batch_size)
    pred_texts = pred_sample[text_column].astype('str').tolist()

    # Tokenize texts and create prediction data set
    print("tokenizing texts...")
    tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
    pred_dataset = SimpleDataset(tokenized_texts)

    # Run predictions
    print("getting predictions...")
    predictions = trainer.predict(pred_dataset)

    # Transform predictions to labels
    print("making dataset...")
    preds = predictions.predictions.argmax(-1)
    # add ids to merge back to original data
    ids = pred_sample[id_column].tolist()

    # convert raw logits to probabilities with a softmax;
    # the score is the probability of the positive (last) class
    probabilities = np.exp(predictions.predictions) / np.exp(predictions.predictions).sum(-1, keepdims=True)
    scores = probabilities.T[-1]

    # Create DataFrame with ids, texts, predictions, and scores
    df = pd.DataFrame(list(zip(ids, pred_texts, preds, scores)), columns=[id_column, text_column, 'prediction', 'score'])

    # append batch results to the csv (header was written at initialization)
    print("saving results...")
    df.to_csv(output_file, index=False, header=False, mode='a')
    end_time = time.time()
    print(f"Time elapsed: {end_time - start_time} seconds")
    print(f"Number of texts left: {len(ids_to_run) - batch_size}")
    if len(ids_to_run) - batch_size == 0:
        print("Done!")
        break
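
To sanity-check the script before a full run, you can generate a small input file first (a minimal sketch: the two texts are hypothetical, and the column names match the User Input section above):

import pandas as pd

# build a two-row test file in the expected format: an id column and a text column
pd.DataFrame({
    "id": [1, 2],
    "text": [
        "I never gave up on my robotics project.",
        "I organized a fundraiser for my school.",
    ],
}).to_csv("input.csv", index=False)

After the run, output.csv will contain one row per id with the predicted class and the probability of the positive class.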
