from nltk.tokenize import sent_tokenize
import pandas as pd

# Read in an essay and return a list of its sentences.
def essay_to_sent(essay):
    sentences = []
    paragraphs = [l for l in essay.split('\n') if len(l) > 0]
    for para in paragraphs:
        # Split each paragraph into sentences with NLTK's Punkt sentence tokenizer.
        sentences.extend(sent_tokenize(para))
    return sentences
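# Quick sanity check of essay_to_sent (illustrative input; assumes the NLTK
# Punkt data has been downloaded via nltk.download('punkt')):
sample_essay = "Dogs are loyal. They guard the house.\nCats are independent."
print(essay_to_sent(sample_essay))
# -> ['Dogs are loyal.', 'They guard the house.', 'Cats are independent.']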
######################
# Prerequisites:
# 1. pip install transformers
# 2. Define the tokenizer + MAX_LEN
# 3. Construct the DistillBERTClass_SL class
# 4. Construct the Triage_SL class
# 5. Define the predict_SL function
# 6. Load model_SL & call eval()
# 7. Predefine predict_params_SL
######################
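# The prerequisite classes above are defined elsewhere and not shown in this
# snippet. A minimal, hypothetical sketch of what DistillBERTClass_SL might look
# like (a DistilBERT encoder with a single-logit head, matching the sigmoid
# applied at prediction time below):
import torch
from transformers import DistilBertModel

class DistillBERTClass_SL(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained('distilbert-base-cased')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)  # one logit per sentence

    def forward(self, input_ids, attention_mask):
        hidden_state = self.l1(input_ids=input_ids, attention_mask=attention_mask)[0]
        pooled = torch.nn.ReLU()(self.pre_classifier(hidden_state[:, 0]))  # [CLS] position
        return self.classifier(self.dropout(pooled))  # raw logit; apply sigmoid downstream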
from transformers import DistilBertTokenizer, pipeline

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

def predict_mainidea_sent(paragraph, model):
    # Prepare data: split the essay into sentences.
    sentences = essay_to_sent(paragraph)

    # Classify each sentence; the sigmoid turns the single logit into a probability.
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cpu")
    probability_score = pipe(sentences, batch_size=8, function_to_apply="sigmoid")

    # Threshold at 0.5 to get a boolean main-idea label per sentence.
    labels = [score['score'] > 0.5 for score in probability_score]
    return pd.DataFrame([(str(l), s) for l, s in zip(labels, sentences)],
                        columns=['label', 'sentence'])
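# Illustrative usage, assuming a fine-tuned sequence-classification checkpoint;
# "path/to/model_SL" is a placeholder, not the project's actual checkpoint.
from transformers import AutoModelForSequenceClassification

model_SL = AutoModelForSequenceClassification.from_pretrained("path/to/model_SL")
model_SL.eval()

essay = "Recycling reduces waste. It also saves energy.\nMany cities now require it."
print(predict_mainidea_sent(essay, model_SL))
# -> DataFrame with one row per sentence and columns ['label', 'sentence']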