Spaces:
Runtime error
Runtime error
Kushwanth Chowday Kandala
committed on
add combine_text functionality prep to chunk the data with the model limits
Browse files
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import pandas as pd
|
|
| 6 |
from io import StringIO
|
| 7 |
import PyPDF2
|
| 8 |
from tqdm import tqdm
|
|
|
|
| 9 |
# import json
|
| 10 |
|
| 11 |
# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
|
|
@@ -142,6 +143,16 @@ def print_out(pages):
|
|
| 142 |
text = pages[i].extract_text().strip()
|
| 143 |
st.write(f"Page {i} : {text}")
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
with st.sidebar:
|
| 146 |
st.markdown("""
|
| 147 |
***Follow this steps***
|
|
@@ -170,4 +181,4 @@ with st.sidebar:
|
|
| 170 |
reader = PyPDF2.PdfReader(uploaded_file)
|
| 171 |
pages = reader.pages
|
| 172 |
print_out(pages)
|
| 173 |
-
|
|
|
|
| 6 |
from io import StringIO
|
| 7 |
import PyPDF2
|
| 8 |
from tqdm import tqdm
|
| 9 |
+
import math
|
| 10 |
# import json
|
| 11 |
|
| 12 |
# st.config(PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python")
|
|
|
|
| 143 |
text = pages[i].extract_text().strip()
|
| 144 |
st.write(f"Page {i} : {text}")
|
| 145 |
|
| 146 |
+
def combine_text(pages):
    """Concatenate the text of every page and report its length and size.

    Each page's ``extract_text()`` result is stripped and the pieces are
    joined without a separator. The total character count and the UTF-8
    encoded size (in MB, rounded to two decimals) are then shown with
    ``st.write``.

    Args:
        pages: Iterable of page objects exposing ``extract_text()``
            (the caller passes ``PyPDF2.PdfReader(...).pages``).
    """
    # Collect the pieces and join once instead of repeated ``+=``.
    concatenated_text = "".join(
        page.extract_text().strip() for page in tqdm(pages)
    )

    # The size must be an integer byte count of the *whole* text. The
    # previous code divided a ``bytes`` object (built from the last page
    # only) by a float, which raises TypeError.
    byte_count = len(concatenated_text.encode("utf-8"))
    mbsize = round(byte_count / (1024 ** 2), 2)

    st.write(f"There are {len(concatenated_text)} characters in the pdf with {mbsize}MB size")
|
| 155 |
+
|
| 156 |
with st.sidebar:
|
| 157 |
st.markdown("""
|
| 158 |
***Follow this steps***
|
|
|
|
| 181 |
reader = PyPDF2.PdfReader(uploaded_file)
|
| 182 |
pages = reader.pages
|
| 183 |
print_out(pages)
|
| 184 |
+
combine_text(pages)
|