Spaces:
Sleeping
Sleeping
Update document_generator_v2.py
Browse files- document_generator_v2.py +40 -0
document_generator_v2.py
CHANGED
|
@@ -172,6 +172,7 @@ import psycopg2
|
|
| 172 |
from datetime import datetime
|
| 173 |
import base64
|
| 174 |
from fastapi import Form
|
|
|
|
| 175 |
|
| 176 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 177 |
logger = logging.getLogger(__name__)
|
|
@@ -448,6 +449,45 @@ class MarkdownConverter:
|
|
| 448 |
markdown += "</div>"
|
| 449 |
return markdown
|
| 450 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
router = APIRouter()
|
| 452 |
|
| 453 |
class JsonDocumentResponse(BaseModel):
|
|
|
|
| 172 |
from datetime import datetime
|
| 173 |
import base64
|
| 174 |
from fastapi import Form
|
| 175 |
+
from llama_parse import LlamaParse
|
| 176 |
|
| 177 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 178 |
logger = logging.getLogger(__name__)
|
|
|
|
| 449 |
markdown += "</div>"
|
| 450 |
return markdown
|
| 451 |
|
| 452 |
+
async def load_documents(documents: List[UploadFile]) -> List[str]:
    """
    Load and parse documents using LlamaParse.

    Each upload is written to a private, per-call temporary directory, the
    resulting paths are handed to ``LlamaParse.aload_data``, and the text of
    every parsed document is returned. The temporary directory (and all
    files in it) is removed when the function exits, whether parsing
    succeeds or raises.

    Args:
        documents (List[UploadFile]): List of uploaded document files.

    Returns:
        List[str]: List of parsed document contents (``.text`` of each
        document returned by the parser). Empty when ``documents`` is empty.
    """
    # Local import so this block does not depend on the file's import header.
    import tempfile

    # Nothing to parse: skip creating the parser and calling the service.
    if not documents:
        return []

    parser = LlamaParse(
        api_key=os.getenv("LLAMA_PARSE_API_KEY"),
        result_type="markdown",
        num_workers=4,
        verbose=True,
        language="en",
    )

    # A fresh directory per call isolates concurrent requests from each other
    # and guarantees cleanup even if a read/write fails half-way through.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_files = []
        for index, doc in enumerate(documents):
            # The filename is client-controlled: keep only its final path
            # component so it cannot escape temp_dir (e.g. "../../etc/x").
            # The original name (and its extension) is otherwise preserved.
            base_name = os.path.basename(doc.filename or "") or "upload"
            # The index prefix keeps same-named uploads from overwriting
            # each other and neutralises special names such as "..".
            temp_file_path = os.path.join(temp_dir, f"{index}_{base_name}")

            content = await doc.read()
            with open(temp_file_path, "wb") as buffer:
                buffer.write(content)
            temp_files.append(temp_file_path)

        # Use LlamaParse to extract content
        parsed_documents = await parser.aload_data(temp_files)
        return [parsed.text for parsed in parsed_documents]
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
|
| 491 |
router = APIRouter()
|
| 492 |
|
| 493 |
class JsonDocumentResponse(BaseModel):
|