Remove dependence on Spacy
models.py CHANGED
@@ -4,9 +4,6 @@ import streamlit as st
 from keybert import KeyBERT
 
 
-import spacy
-nlp = spacy.load('en_core_web_sm')
-
 # Reference: https://discuss.huggingface.co/t/summarization-on-long-documents/920/7
 def create_nest_sentences(document:str, token_max_length = 1024):
   nested = []
@@ -15,7 +12,7 @@ def create_nest_sentences(document:str, token_max_length = 1024):
   tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
   tokens = nlp(document)
 
-  for sentence in tokens.sents:
+  for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
     tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
     length += len(tokens_in_sentence)
 
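For context, here is a minimal, self-contained sketch of the regex split that takes over from spacy's sentence segmentation; the sample text and variable names are illustrative and not taken from models.py.

import re

# Illustrative multi-sentence input (not from the repository).
document = ("BART-style models accept at most 1024 tokens per input. "
            "Long documents therefore have to be split into sentences first.\n"
            "Groups of sentences are then summarized separately.")

# Same pattern as the added line above: split wherever '.' or '?' is followed
# by spaces and a capital letter, after flattening newlines into spaces.
sentences = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' '))

for sentence in sentences:
    print(repr(sentence))

The [^A-Z] guard in the lookbehind avoids breaking after short capitalized abbreviations such as "Dr." or "Mr.", but the heuristic still misfires on cases like "e.g." followed by a capitalized word, which is the trade-off of dropping spacy's trained sentence segmenter.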
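The Hugging Face thread referenced above describes packing sentences into groups that stay under the model's 1024-token limit. A hedged sketch of that accumulation pattern follows; the grouping logic and the whitespace word count used in place of the facebook/bart-large-mnli tokenizer are assumptions for illustration, not the actual body of create_nest_sentences.

import re

def nest_sentences_sketch(document: str, token_max_length: int = 1024):
    # Illustrative only: mirrors the create_nest_sentences signature from the
    # diff, with a whitespace word count standing in for the HF tokenizer.
    nested = []      # completed groups of sentences, each under the budget
    sentences = []   # group currently being filled
    length = 0       # running token count for the current group

    for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
        tokens_in_sentence = sentence.split()  # assumption: word count as token count
        length += len(tokens_in_sentence)

        if length < token_max_length:
            sentences.append(sentence)
        else:
            # Budget exceeded: close the current group and start a new one.
            nested.append(sentences)
            sentences = [sentence]
            length = len(tokens_in_sentence)

    if sentences:
        nested.append(sentences)
    return nested

Each returned group can then be joined and summarized on its own, which is how the linked discussion works around the 1024-token ceiling of BART-style models.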