This organization hosts models predicting seven personal qualities from students' admissions essays. Each model is a text classifier that takes text as input and outputs probabilities for each personal quality.
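As a quick sanity check before running the batch script below, a single essay can be scored with the `transformers` pipeline API. This is a minimal sketch: `pqadmissions/perseverance_1` is one of the hosted models, used here as an example, and the essay text is made up.

```python
from transformers import pipeline

# Load one of the hosted classifiers. "pqadmissions/perseverance_1" is
# used as an example; swap in any of the other quality models.
classifier = pipeline(
    "text-classification",
    model="pqadmissions/perseverance_1",
    top_k=None,  # return scores for every label, not just the top one
)

# A made-up essay snippet for illustration.
essay = "I kept revising my project every weekend until it finally worked."
print(classifier(essay))  # prints label/score pairs for this text
```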
Here is example code for running these models, adapted from the example for the excellent j-hartmann/emotion-english-distilroberta-base model. It runs a loop in which `batch_size` examples are passed to the model and the results are appended to a file after each batch. This is to accommodate potential system crashes: it is better to save progress periodically than to lose everything. If you do experience a crash, simply rerun the script; it reads the existing results file and only processes the ids that have not been scored yet (see the comment at the results-file initialization step).
```python
# import required packages
import os
import time

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# User input
model_name = "pqadmissions/perseverance_1"  # name of the model
file_name = "input.csv"     # input file (needs an id column and a text column)
output_file = "output.csv"  # output file
id_column = "id"            # column in your csv with the unique id for each text
text_column = "text"        # column in your csv with the text to be classified
batch_size = 10_000         # number of texts scored between saves


# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        return len(self.tokenized_texts["input_ids"])

    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}


# load tokenizer and model, create trainer
print("downloading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

# read in csv
print("reading in csv...")
df_pred = pd.read_csv(file_name)
all_ids = df_pred[id_column].tolist()

# initialize results file -- only if it does not exist yet, so that
# rerunning the script after a crash resumes instead of overwriting
if not os.path.exists(output_file):
    pd.DataFrame(columns=[id_column, text_column, "prediction", "score"]).to_csv(
        output_file, index=False
    )

results = pd.read_csv(output_file)
ids_ran = results[id_column].tolist()
ids_to_run = np.setdiff1d(all_ids, ids_ran).tolist()

while len(ids_to_run) > 0:
    start_time = time.time()
    print("preparing data...")

    # subset the csv to the ids that have not been scored yet
    results = pd.read_csv(output_file)
    ids_ran = results[id_column].tolist()
    ids_to_run = np.setdiff1d(all_ids, ids_ran).tolist()
    if len(ids_to_run) < batch_size:
        batch_size = len(ids_to_run)
    pred_sample = df_pred[df_pred[id_column].isin(ids_to_run)].sample(batch_size)
    # fill missing texts with an empty string so ids and texts stay aligned
    pred_texts = pred_sample[text_column].fillna("").astype(str).tolist()

    # tokenize texts and create prediction dataset
    print("tokenizing texts...")
    tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
    pred_dataset = SimpleDataset(tokenized_texts)

    # run predictions
    print("getting predictions...")
    predictions = trainer.predict(pred_dataset)

    # transform logits to predicted labels
    print("making dataset...")
    preds = predictions.predictions.argmax(-1)

    # keep ids to merge back to the original data
    ids = pred_sample[id_column].tolist()

    # softmax over the logits; the score is the probability of the last
    # (positive) class
    logits = predictions.predictions
    probabilities = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
    scores = probabilities.T[-1]

    # create DataFrame with ids, texts, predictions, and scores
    df = pd.DataFrame(
        list(zip(ids, pred_texts, preds, scores)),
        columns=[id_column, text_column, "prediction", "score"],
    )

    # append results to csv
    print("saving results...")
    df.to_csv(output_file, index=False, header=False, mode="a")

    end_time = time.time()
    print(f"Time elapsed: {end_time - start_time} seconds")
    print(f"Number of texts left: {len(ids_to_run) - batch_size}")
    if len(ids_to_run) - batch_size == 0:
        print("Done!")
        break
```
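For reference, the input file only needs the two columns named above. Here is a minimal sketch of creating a toy `input.csv`; the ids and essay texts are made up for illustration.

```python
import pandas as pd

# A toy input file: one row per essay, with a unique id per row.
pd.DataFrame(
    {
        "id": [1, 2, 3],
        "text": [
            "I kept revising my project every weekend until it worked.",
            "Leading the food drive taught me to listen first.",
            "I tutor my younger siblings in math after school.",
        ],
    }
).to_csv("input.csv", index=False)
```

After the script finishes, `output.csv` contains one row per id with the predicted label and the model's probability for the positive class.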