Remove extra spaces
Browse files
models.py
CHANGED
|
@@ -15,14 +15,12 @@ def create_nest_sentences(document:str, token_max_length = 1024):
|
|
| 15 |
for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
|
| 16 |
tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
|
| 17 |
length += len(tokens_in_sentence)
|
| 18 |
-
|
| 19 |
if length < token_max_length:
|
| 20 |
sent.append(sentence)
|
| 21 |
else:
|
| 22 |
nested.append(sent)
|
| 23 |
sent = [sentence]
|
| 24 |
length = 0
|
| 25 |
-
|
| 26 |
if sent:
|
| 27 |
nested.append(sent)
|
| 28 |
return nested
|
|
@@ -42,8 +40,6 @@ def keyword_gen(kw_model, sequence:str):
|
|
| 42 |
top_n=10)
|
| 43 |
return keywords
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
| 47 |
# Reference: https://huggingface.co/facebook/bart-large-mnli
|
| 48 |
@st.cache_resource
|
| 49 |
def load_summary_model():
|
|
@@ -69,7 +65,6 @@ def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:
|
|
| 69 |
no_repeat_ngram_size=3)
|
| 70 |
return output[0].get('summary_text')
|
| 71 |
|
| 72 |
-
|
| 73 |
# # Reference: https://www.datatrigger.org/post/nlp_hugging_face/
|
| 74 |
# # Custom summarization pipeline (to handle long articles)
|
| 75 |
# def summarize(text, minimum_length_of_summary = 100):
|
|
@@ -80,7 +75,6 @@ def summarizer_gen(summarizer, sequence:str, maximum_tokens:int, minimum_tokens:
|
|
| 80 |
# # Untokenize
|
| 81 |
# return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])
|
| 82 |
|
| 83 |
-
|
| 84 |
# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
|
| 85 |
@st.cache_resource
|
| 86 |
def load_model():
|
|
@@ -93,4 +87,3 @@ def load_model():
|
|
| 93 |
def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
|
| 94 |
outputs = classifier(sequence, labels, multi_label=multi_class)
|
| 95 |
return outputs['labels'], outputs['scores']
|
| 96 |
-
|
|
|
|
| 15 |
for sentence in re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', document.replace("\n", ' ')):
|
| 16 |
tokens_in_sentence = tokenizer(str(sentence), truncation=False, padding=False)[0] # hugging face transformer tokenizer
|
| 17 |
length += len(tokens_in_sentence)
|
|
|
|
| 18 |
if length < token_max_length:
|
| 19 |
sent.append(sentence)
|
| 20 |
else:
|
| 21 |
nested.append(sent)
|
| 22 |
sent = [sentence]
|
| 23 |
length = 0
|
|
|
|
| 24 |
if sent:
|
| 25 |
nested.append(sent)
|
| 26 |
return nested
|
|
|
|
| 40 |
top_n=10)
|
| 41 |
return keywords
|
| 42 |
|
|
|
|
|
|
|
| 43 |
# Reference: https://huggingface.co/facebook/bart-large-mnli
|
| 44 |
@st.cache_resource
|
| 45 |
def load_summary_model():
|
|
|
|
| 65 |
no_repeat_ngram_size=3)
|
| 66 |
return output[0].get('summary_text')
|
| 67 |
|
|
|
|
| 68 |
# # Reference: https://www.datatrigger.org/post/nlp_hugging_face/
|
| 69 |
# # Custom summarization pipeline (to handle long articles)
|
| 70 |
# def summarize(text, minimum_length_of_summary = 100):
|
|
|
|
| 75 |
# # Untokenize
|
| 76 |
# return([tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0])
|
| 77 |
|
|
|
|
| 78 |
# Reference: https://huggingface.co/spaces/team-zero-shot-nli/zero-shot-nli/blob/main/utils.py
|
| 79 |
@st.cache_resource
|
| 80 |
def load_model():
|
|
|
|
| 87 |
def classifier_zero(classifier, sequence:str, labels:list, multi_class:bool):
|
| 88 |
outputs = classifier(sequence, labels, multi_label=multi_class)
|
| 89 |
return outputs['labels'], outputs['scores']
|
|
|