Spaces:
Runtime error
Runtime error
Commit
·
a5ee254
1
Parent(s):
970a7e9
Update app.py
Browse files
app.py
CHANGED
|
@@ -350,47 +350,90 @@ def load_single_example_text(
|
|
| 350 |
return text
|
| 351 |
|
| 352 |
|
| 353 |
-
def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
"""
|
| 355 |
-
|
| 356 |
|
| 357 |
-
:param
|
| 358 |
:param int max_pages: the maximum number of pages to load from a PDF
|
| 359 |
:param bool lower: whether to lowercase the text
|
| 360 |
-
:return str: the text of the
|
| 361 |
"""
|
| 362 |
global ocr_model
|
| 363 |
logger = logging.getLogger(__name__)
|
| 364 |
-
#
|
| 365 |
-
if isinstance(file_obj, list):
|
| 366 |
-
file_obj = file_obj[0]
|
| 367 |
-
file_path = Path(file_obj.name)
|
| 368 |
-
try:
|
| 369 |
-
logger.info(f"Loading file:\t{file_path}")
|
| 370 |
-
if file_path.suffix in [".txt", ".md"]:
|
| 371 |
-
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 372 |
-
raw_text = f.read()
|
| 373 |
-
text = clean(raw_text, lower=lower)
|
| 374 |
-
elif file_path.suffix == ".pdf":
|
| 375 |
-
logger.info(f"loading a PDF file: {file_path.name}")
|
| 376 |
-
max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
|
| 377 |
-
logger.info(f"max_pages is: {max_pages}. Starting conversion...")
|
| 378 |
-
conversion_stats = convert_PDF_to_Text(
|
| 379 |
-
file_path,
|
| 380 |
-
ocr_model=ocr_model,
|
| 381 |
-
max_pages=max_pages,
|
| 382 |
-
)
|
| 383 |
-
text = conversion_stats["converted_text"]
|
| 384 |
-
else:
|
| 385 |
-
logger.error(f"Unknown file type:\t{file_path.suffix}")
|
| 386 |
-
text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
|
| 387 |
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
|
|
|
|
| 393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
def parse_args():
|
| 395 |
"""arguments for the command line interface"""
|
| 396 |
parser = argparse.ArgumentParser(
|
|
|
|
| 350 |
return text
|
| 351 |
|
| 352 |
|
| 353 |
+
# def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
|
| 354 |
+
# """
|
| 355 |
+
# load_uploaded_file - loads a file uploaded by the user
|
| 356 |
+
|
| 357 |
+
# :param file_obj (POTENTIALLY list): Gradio file object inside a list
|
| 358 |
+
# :param int max_pages: the maximum number of pages to load from a PDF
|
| 359 |
+
# :param bool lower: whether to lowercase the text
|
| 360 |
+
# :return str: the text of the file
|
| 361 |
+
# """
|
| 362 |
+
# global ocr_model
|
| 363 |
+
# logger = logging.getLogger(__name__)
|
| 364 |
+
# # check if mysterious file object is a list
|
| 365 |
+
# if isinstance(file_obj, list):
|
| 366 |
+
# file_obj = file_obj[0]
|
| 367 |
+
# file_path = Path(file_obj.name)
|
| 368 |
+
# try:
|
| 369 |
+
# logger.info(f"Loading file:\t{file_path}")
|
| 370 |
+
# if file_path.suffix in [".txt", ".md"]:
|
| 371 |
+
# with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 372 |
+
# raw_text = f.read()
|
| 373 |
+
# text = clean(raw_text, lower=lower)
|
| 374 |
+
# elif file_path.suffix == ".pdf":
|
| 375 |
+
# logger.info(f"loading a PDF file: {file_path.name}")
|
| 376 |
+
# max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
|
| 377 |
+
# logger.info(f"max_pages is: {max_pages}. Starting conversion...")
|
| 378 |
+
# conversion_stats = convert_PDF_to_Text(
|
| 379 |
+
# file_path,
|
| 380 |
+
# ocr_model=ocr_model,
|
| 381 |
+
# max_pages=max_pages,
|
| 382 |
+
# )
|
| 383 |
+
# text = conversion_stats["converted_text"]
|
| 384 |
+
# else:
|
| 385 |
+
# logger.error(f"Unknown file type:\t{file_path.suffix}")
|
| 386 |
+
# text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
|
| 387 |
+
|
| 388 |
+
# return text
|
| 389 |
+
# except Exception as e:
|
| 390 |
+
# logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
| 391 |
+
# return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
|
| 392 |
+
|
| 393 |
+
|
| 394 |
+
def load_uploaded_files(file_objs, max_pages: int = 20, lower: bool = False) -> str:
    """
    load_uploaded_files - loads one or more files uploaded by the user and concatenates their contents

    TXT/MD files are read as UTF-8 (undecodable bytes are ignored) and passed through `clean`;
    PDF files are converted with `convert_PDF_to_Text` using the module-level `ocr_model`.
    Any other extension contributes an "unknown file type" message in place of its text.

    :param file_objs (list): List of Gradio file objects (objects exposing the path as `.name`, or
        plain path strings); a single object that is not wrapped in a list is also accepted
    :param int max_pages: the maximum number of pages to load from a PDF; the APP_OCR_MAX_PAGES
        environment variable takes precedence when it is set
    :param bool lower: whether to lowercase the text (only forwarded to `clean` for TXT/MD files)
    :return str: the text of all the files separated by a blank line, or a generic error message
        if reading any of the files raised an exception
    """
    logger = logging.getLogger(__name__)

    # The upload widget may hand over a bare file object instead of a list;
    # normalise so the loop below never iterates over the file itself.
    if not isinstance(file_objs, (list, tuple)):
        file_objs = [file_objs]

    texts = []
    try:
        for file_obj in file_objs:
            # Fall back to the object itself so plain path strings work too.
            file_path = Path(getattr(file_obj, "name", file_obj))
            logger.info(f"Loading file:\t{file_path}")

            # Compare case-insensitively so e.g. "REPORT.PDF" is still recognised.
            suffix = file_path.suffix.lower()
            if suffix in (".txt", ".md"):
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    raw_text = f.read()
                text = clean(raw_text, lower=lower)
            elif suffix == ".pdf":
                logger.info(f"loading a PDF file: {file_path.name}")
                max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
                logger.info(f"max_pages is: {max_pages}. Starting conversion...")
                conversion_stats = convert_PDF_to_Text(
                    file_path,
                    ocr_model=ocr_model,
                    max_pages=max_pages,
                )
                text = conversion_stats["converted_text"]
            else:
                logger.error(f"Unknown file type:\t{file_path.suffix}")
                text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."

            texts.append(text)

        # A blank line between documents keeps the last word of one file from
        # fusing with the first word of the next.
        return "\n\n".join(texts)
    except Exception as e:
        # All-or-nothing: a failure on any file discards the partial result so
        # the caller never receives a silently incomplete concatenation.
        logger.error(f"Error: {e}")
        return "Error: Could not read one or more files. Make sure they are PDF, TXT, or MD files."
|
| 436 |
+
|
| 437 |
def parse_args():
|
| 438 |
"""arguments for the command line interface"""
|
| 439 |
parser = argparse.ArgumentParser(
|