Spaces:

Tesneem
/

document_chunker

Sleeping

App Files Community

Tesneem committed on Aug 8

Commit

23a7785

verified ·

1 Parent(s): 5ece6b7

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -17

app.py CHANGED Viewed

@@ -1,61 +1,65 @@
-import streamlit as st
 import os
 import tempfile
 from pymongo import MongoClient
 from datetime import datetime
 from pathlib import Path
 from document_chunker import DocumentChunker
-from dotenv import load_dotenv
-load_dotenv()
-# MongoDB connection
-mongo_uri = os.getenv("MONGO_URI")
-db_name = os.getenv("MONGO_DB", "grant_docs")
 client = MongoClient(mongo_uri)
 st.set_page_config(page_title="Doc Chunker", layout="wide")
 st.title("📄 Document Chunker & Uploader")
 with st.sidebar:
     st.header("Settings")
-    selected_collection = st.text_input("MongoDB Collection Name", "doc_chunks_cat")
     is_grant_app = st.toggle("Is this a Grant Application?", value=True)
-    if st.button("Connect to Collection"):
-        collection = client[db_name][selected_collection]
-        st.success(f"Connected to `{selected_collection}` in `{db_name}`")
 uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])
 if uploaded_file:
-    # Save file to temp path
     temp_path = Path(tempfile.gettempdir()) / uploaded_file.name
     with open(temp_path, "wb") as f:
         f.write(uploaded_file.getbuffer())
     st.success(f"Uploaded `{uploaded_file.name}`")
-    # Check if file already exists in collection
     modified_time = datetime.now().isoformat()
-    collection = client[db_name][selected_collection]
     if collection.find_one({"metadata.title": uploaded_file.name}):
         st.warning("⚠️ This file already exists in the collection. Skipping...")
     else:
-        st.write("⏳ Processing...")
         chunker = DocumentChunker()
-        chunks = chunker.process_document(str(temp_path)) if is_grant_app else chunker.process_document(str(temp_path))
         if chunks:
             for chunk in chunks:
                 chunk['metadata'].update({
                     "title": uploaded_file.name,
                     "uploaded_at": modified_time,
                 })
                 collection.insert_one(chunk)
             st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")
-            # Show preview
             for i, c in enumerate(chunks[:3]):
                 st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                 st.markdown(c['text'][:400] + "...")

 import os
+import streamlit as st
 import tempfile
 from pymongo import MongoClient
 from datetime import datetime
 from pathlib import Path
 from document_chunker import DocumentChunker
+# === MongoDB connection via Hugging Face secrets ===
+mongo_uri = os.environ["MONGO_URI"]
+db_name = os.environ.get("MONGO_DB", "grant_docs")
 client = MongoClient(mongo_uri)
+db = client[db_name]
+# === Streamlit UI ===
 st.set_page_config(page_title="Doc Chunker", layout="wide")
 st.title("📄 Document Chunker & Uploader")
 with st.sidebar:
     st.header("Settings")
+    # Fetch collection names for dropdown
+    try:
+        existing_collections = db.list_collection_names()
+        selected_collection = st.selectbox("Choose MongoDB Collection", existing_collections, index=existing_collections.index("doc_chunks_cat") if "doc_chunks_cat" in existing_collections else 0)
+    except Exception as e:
+        st.error(f"Failed to list collections: {e}")
+        selected_collection = "doc_chunks_cat"
     is_grant_app = st.toggle("Is this a Grant Application?", value=True)
 uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])
 if uploaded_file:
     temp_path = Path(tempfile.gettempdir()) / uploaded_file.name
     with open(temp_path, "wb") as f:
         f.write(uploaded_file.getbuffer())
     st.success(f"Uploaded `{uploaded_file.name}`")
     modified_time = datetime.now().isoformat()
+    collection = db[selected_collection]
     if collection.find_one({"metadata.title": uploaded_file.name}):
         st.warning("⚠️ This file already exists in the collection. Skipping...")
     else:
+        st.write("⏳ Processing with DocumentChunker...")
         chunker = DocumentChunker()
+        chunks = chunker.process_document(str(temp_path))
         if chunks:
             for chunk in chunks:
                 chunk['metadata'].update({
                     "title": uploaded_file.name,
                     "uploaded_at": modified_time,
+                    "is_grant_app": is_grant_app,
                 })
                 collection.insert_one(chunk)
             st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")
+            # Show a few previews
             for i, c in enumerate(chunks[:3]):
                 st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                 st.markdown(c['text'][:400] + "...")