Update functions.py
functions.py  CHANGED  +26 -0
@@ -136,6 +136,32 @@ def process_corpus(corpus, _tokenizer, title, embedding_model, chunk_size=200, o
 
     return docsearch
 
+@st.experimental_singleton(suppress_st_warning=True)
+def chunk_and_preprocess_text(text,thresh=500):
+
+    """Chunk text longer than n tokens for summarization"""
+
+    sentences = sent_tokenize(clean_text(text))
+    #sentences = [i.text for i in list(article.sents)]
+
+    current_chunk = 0
+    chunks = []
+
+    for sentence in sentences:
+        if len(chunks) == current_chunk + 1:
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= thresh:
+                chunks[current_chunk].extend(sentence.split(" "))
+            else:
+                current_chunk += 1
+                chunks.append(sentence.split(" "))
+        else:
+            chunks.append(sentence.split(" "))
+
+    for chunk_id in range(len(chunks)):
+        chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+    return chunks
+
 @st.experimental_singleton(suppress_st_warning=True)
 def gen_embeddings(embedding_model):
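For context, a minimal sketch of how the new helper might be called. This is not part of the commit: it assumes the module already imports nltk's sent_tokenize and defines a clean_text() helper (both referenced by the added function), and it notes that thresh is compared against whitespace-split word counts rather than model tokens.

    # Hypothetical usage sketch, not part of the commit.
    # Assumes the surrounding module provides clean_text() and
    # `from nltk.tokenize import sent_tokenize`, which the new function relies on.

    article_text = open("article.txt").read()  # any long document (hypothetical file)
    chunks = chunk_and_preprocess_text(article_text, thresh=500)

    for i, chunk in enumerate(chunks):
        # Each chunk holds at most ~500 whitespace-separated words,
        # so it can be fed to a summarization model in smaller pieces.
        print(f"chunk {i}: {len(chunk.split())} words")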