Nihal2000 committed on
Commit
3e772ec
·
1 Parent(s): eb60db5

First version

Browse files
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import gradio as gr
4
+ from gradio import components
5
+ from fastmcp import FastMCP
6
+ # from core.parser import parse_document, parse_url
7
+ from core.parser import parse_document, parse_url
8
+ from core.summarizer import summarize_content, tag_content
9
+ from core.storage import add_document, search_documents
10
+ from core.agent import answer_question
11
+ # from core.components import DocumentViewer
12
+ import plotly.graph_objects as go
13
+
14
+ # Initialize the FastMCP server (for agentic tools)
15
+ mcp = FastMCP("IntelligentContentOrganizer")
16
+
17
+ # Gradio UI functions
18
def process_content(file_obj, url, tags_input):
    """
    Handle file upload or URL input: parse content, summarize, tag, store.

    Args:
        file_obj: Value of the file input. Either a path string or an object
            whose ``.name`` is the path; ``None`` when nothing was uploaded.
        url: Page URL to fetch when no file is given.
        tags_input: Optional comma-separated tags that replace the
            auto-detected ones.

    Returns:
        A 4-tuple ``(document text, summary, comma-joined tags, status)``.
    """
    if file_obj is not None:
        # Recent Gradio releases hand the upload over as a plain path string;
        # older ones passed a temp-file wrapper exposing the path as ``.name``.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        content_text = parse_document(file_path)
        source = file_path
    elif url:
        content_text = parse_url(url)
        source = url
    else:
        return "No document provided.", "", "", ""

    # The parsers report failures as "Error ..." strings instead of raising;
    # show those in the status box rather than indexing them as documents.
    if content_text.startswith(("Error parsing document:", "Error fetching URL:")):
        return "", "", "", content_text
    if not content_text.strip():
        return "", "", "", "No text could be extracted from the source."

    # Summarize and tag (simulated)
    summary = summarize_content(content_text)
    tags = tag_content(content_text)

    # User-supplied tags win, but only if at least one non-blank tag was given.
    if tags_input:
        user_tags = [t.strip() for t in tags_input.split(",") if t.strip()]
        if user_tags:
            tags = user_tags

    # Store in ChromaDB with a unique ID.
    # NOTE(review): tags are stored as a list; confirm the installed ChromaDB
    # version accepts list-valued metadata.
    doc_id = str(uuid.uuid4())
    metadata = {"source": source, "tags": tags}
    add_document(doc_id, content_text, metadata)

    return content_text, summary, ", ".join(tags), f"Document stored with ID: {doc_id}"
50
+
51
def _doc_tags(doc):
    """Return a document's tags as a set, tolerating absent metadata and comma-separated strings."""
    metadata = doc.get("metadata") or {}
    raw = metadata.get("tags") or []
    if isinstance(raw, str):
        raw = raw.split(",")
    return {str(tag).strip() for tag in raw if str(tag).strip()}


def generate_graph():
    """
    Create a simple Plotly graph of documents.
    Nodes = documents, edges = shared tags.

    Returns:
        A ``go.Figure``; empty when no documents are stored.
    """
    from itertools import combinations

    # Fetch all documents from ChromaDB
    from core.storage import get_all_documents
    docs = get_all_documents()
    if not docs:
        return go.Figure()  # empty

    # Build each document's tag set once, then connect every pair that shares a tag.
    nodes = {doc["id"]: doc for doc in docs}
    tag_sets = {doc_id: _doc_tags(doc) for doc_id, doc in nodes.items()}
    edges = [
        (first, second)
        for first, second in combinations(nodes, 2)
        if tag_sets[first] & tag_sets[second]
    ]

    # Use networkx to compute a reproducible layout (fixed seed).
    import networkx as nx
    G = nx.Graph()
    G.add_nodes_from(nodes.keys())
    G.add_edges_from(edges)
    pos = nx.spring_layout(G, seed=42)

    # Edge trace: a None entry separates consecutive line segments.
    edge_x = []
    edge_y = []
    for src, dst in edges:
        x0, y0 = pos[src]
        x1, y1 = pos[dst]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='#888'),
        hoverinfo='none',
        mode='lines')

    node_x = []
    node_y = []
    node_text = []
    for node_id in G.nodes():
        x, y = pos[node_id]
        node_x.append(x)
        node_y.append(y)
        source = (nodes[node_id].get("metadata") or {}).get("source", "")
        # Plotly breaks text lines on <br>, not on "\n".
        node_text.append(f"{source}<br>Tags: {', '.join(sorted(tag_sets[node_id]))}")

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        marker=dict(size=10, color='skyblue'),
        text=node_text, hoverinfo='text', textposition="bottom center")

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(title="Document Knowledge Graph",
                                     showlegend=False,
                                     margin=dict(l=20, r=20, b=20, t=30)))
    return fig
113
+
114
def handle_query(question):
    """
    Return the agent's answer for ``question``, or a prompt to enter one
    when the question is empty.
    """
    return answer_question(question) if question else "Please enter a question."
123
+
124
# Build the Gradio interface with Blocks: three tabs, each wiring one button
# to one of the handler functions defined above.
with gr.Blocks(title="Intelligent Content Organizer") as demo:
    gr.Markdown("# Intelligent Content Organizer")
    # Tab 1: ingest a document from an uploaded file or a URL.
    with gr.Tab("Upload / Fetch Content"):
        gr.Markdown("**Add a document:** Upload a file or enter a URL.")
        with gr.Row():
            file_in = gr.File(label="Upload Document (PDF, TXT, etc.)")
            url_in = gr.Textbox(label="Document URL", placeholder="https://example.com/article")
        tags_in = gr.Textbox(label="Tags (comma-separated)", placeholder="Enter tags or leave blank")
        process_btn = gr.Button("Parse & Add Document")
        doc_view = gr.Textbox(label="Document Preview", lines=10, interactive=False)
        summary_out = gr.Textbox(label="Summary", interactive=False)
        tags_out = gr.Textbox(label="Detected Tags", interactive=False)
        status_out = gr.Textbox(label="Status/Info", interactive=False)
        # The four outputs line up with the 4-tuple returned by process_content.
        process_btn.click(fn=process_content, inputs=[file_in, url_in, tags_in],
                          outputs=[doc_view, summary_out, tags_out, status_out])

    # Tab 2: plot of stored documents, redrawn on demand.
    with gr.Tab("Knowledge Graph"):
        gr.Markdown("**Document relationships:** Shared tags indicate edges.")
        graph_plot = gr.Plot(label="Knowledge Graph")
        refresh_btn = gr.Button("Refresh Graph")
        refresh_btn.click(fn=generate_graph, inputs=None, outputs=graph_plot)

    # Tab 3: question answering over the stored documents.
    with gr.Tab("Ask a Question"):
        gr.Markdown("**AI Q&A:** Ask a question about your documents.")
        question_in = gr.Textbox(label="Your Question")
        answer_out = gr.Textbox(label="Answer", interactive=False)
        ask_btn = gr.Button("Get Answer")
        ask_btn.click(fn=handle_query, inputs=question_in, outputs=answer_out)

if __name__ == "__main__":
    # Launch Gradio app (Hugging Face Spaces will auto-launch this)
    # demo.queue().launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)))
    # mcp_server=True is passed to launch(); presumably this also exposes the
    # app's functions over MCP — confirm against the installed Gradio version.
    demo.launch(mcp_server=True)
config.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# config.py
# Central place for API keys. Each key is read from the environment and is
# None when the corresponding variable is not set.
import os
from dotenv import load_dotenv
load_dotenv()  # loads from .env if present
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
CLAUDE_API_KEY = os.environ.get("CLAUDE_API_KEY")
BRAVE_API_KEY = os.environ.get("BRAVE_API_KEY")
core/__init__.py ADDED
File without changes
core/agent.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from core.storage import search_documents
3
+ # For Q&A we can use a simple retrieval + QA pipeline (stubbed here)
4
+ # In a real app, you might use LangChain or a HuggingFace question-answering model.
5
+
6
def answer_question(question: str) -> str:
    """
    Agent: retrieve relevant docs and answer the question.

    Args:
        question: The user's question, used as the search query.

    Returns:
        A stub answer naming the question and the number of documents found,
        or ``"No relevant documents found."`` when the search returns no text.
    """
    # Retrieve top documents
    results = search_documents(question, top_k=3) or {}
    # "documents" holds one list per query; tolerate it being missing, empty
    # or None, and drop empty entries so join() cannot fail.
    batches = results.get("documents") or [[]]
    doc_texts = [text for text in (batches[0] or []) if text]
    combined = " ".join(doc_texts)
    # Stub: just echo the question and number of docs
    if not combined.strip():
        return "No relevant documents found."
    return f"Answered question: '{question}' (based on {len(doc_texts)} documents)."
core/ai_enrichment.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/ai_enrichment.py
2
+
3
+ from mistralai import Mistral
4
+ import config
5
+
6
def generate_tags(text: str) -> list[str]:
    """
    Use Mistral AI to generate 5-7 relevant tags for the text.

    Args:
        text: Document text to tag.

    Returns:
        The stripped, non-empty tags parsed from the model's comma-separated
        reply; an empty list when the text is blank or the reply has no
        usable content.
    """
    # Nothing to tag: skip the API call.
    if not text or not text.strip():
        return []
    with Mistral(api_key=config.MISTRAL_API_KEY) as client:
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{
                "role": "user",
                "content": f"Generate 5-7 relevant tags (comma-separated) for the following text:\n\n{text}"
            }]
        )
    try:
        # The SDK returns a response object, so use attribute access.
        content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError):
        return []
    if not isinstance(content, str):
        return []
    return [tag.strip() for tag in content.split(",") if tag.strip()]
24
+
25
def summarize_text(text: str) -> str:
    """
    Use Mistral AI to generate a concise summary of the text.

    Args:
        text: Document text to summarize.

    Returns:
        The model's reply with surrounding whitespace removed; an empty string
        when the text is blank or the reply has no usable content.
    """
    # Nothing to summarize: skip the API call.
    if not text or not text.strip():
        return ""
    with Mistral(api_key=config.MISTRAL_API_KEY) as client:
        response = client.chat.complete(
            model="mistral-small-latest",
            messages=[{
                "role": "user",
                "content": f"Summarize the following text in a concise manner:\n\n{text}"
            }]
        )
    try:
        # The SDK returns a response object, so use attribute access.
        content = response.choices[0].message.content
    except (AttributeError, IndexError, TypeError):
        return ""
    if not isinstance(content, str):
        return ""
    return content.strip()
core/components.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
class DocumentViewer(gr.components.Component):
    """
    Custom Gradio component for document preview and tag editing.
    (Stub implementation)
    """
    def __init__(self, label=None):
        # The component starts with no value; it is visible and read-only.
        super().__init__(label=label, value=None)
        self.visible = True
        self.interactive = False

    def preprocess(self, x):
        """Return the incoming value unchanged."""
        # Input is a file path (or object); just return as-is
        return x

    def postprocess(self, x):
        """Return the first 10 lines of ``x`` joined by newlines, or "" when ``x`` is falsy."""
        # x is the raw document text; display first few lines as preview
        if not x:
            return ""
        lines = x.splitlines()
        preview = "\n".join(lines[:10])
        return preview
core/components.pyi ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio.events import Dependency
3
+
4
class DocumentViewer(gr.components.Component):
    """
    Custom Gradio component for document preview and tag editing.
    (Stub implementation)
    """
    def __init__(self, label=None):
        # The component starts with no value; it is visible and read-only.
        super().__init__(label=label, value=None)
        self.visible = True
        self.interactive = False

    def preprocess(self, x):
        """Return the incoming value unchanged."""
        # Input is a file path (or object); just return as-is
        return x

    def postprocess(self, x):
        """Return the first 10 lines of ``x`` joined by newlines, or "" when ``x`` is falsy."""
        # x is the raw document text; display first few lines as preview
        if not x:
            return ""
        lines = x.splitlines()
        preview = "\n".join(lines[:10])
        return preview
25
+ from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING
26
+ from gradio.blocks import Block
27
+ if TYPE_CHECKING:
28
+ from gradio.components import Timer
29
+ from gradio.components.base import Component
core/database.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/database.py
2
+
3
+ import chromadb
4
+ from chromadb.config import Settings
5
+ from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
6
+ import config
7
+
8
def init_chroma():
    """
    Initialize a ChromaDB client and collection with an embedding function.
    Uses OpenAI embeddings if API key is available, otherwise a dummy embedding.

    Returns:
        The "documents" collection, created on first use.
    """
    # Initialize Chroma client (in-memory by default)
    client = chromadb.Client(Settings())

    # config may not define OPENAI_API_KEY at all; treat that as "no key".
    openai_key = getattr(config, "OPENAI_API_KEY", None)

    if openai_key:
        embedding_fn = OpenAIEmbeddingFunction(
            api_key=openai_key,
            model_name="text-embedding-ada-002"
        )
    else:
        # Dummy embedding: one-dimensional embedding based on text length.
        # Chroma requires the parameter to be named ``input``.
        # NOTE(review): newer Chroma releases may expect further methods on
        # custom embedding functions — confirm against the installed version.
        class DummyEmbedding:
            def __call__(self, input):
                return [[float(len(text))] for text in input]
        embedding_fn = DummyEmbedding()

    # Create or get collection named "documents"
    collection = client.get_or_create_collection(
        name="documents",
        embedding_function=embedding_fn
    )
    return collection
41
+
42
def add_document(collection, doc_id: str, text: str, tags: list[str], summary: str, source: str):
    """
    Add a document to the ChromaDB collection with metadata.

    Args:
        collection: Collection to add to (anything exposing ``add``).
        doc_id: Unique ID for the document.
        text: Document text; the collection embeds it.
        tags: Tags for the document, stored as one comma-separated string.
        summary: Summary stored alongside the document.
        source: Where the document came from (URL or file path).
    """
    # Chroma metadata values must be scalars, so the tag list is flattened
    # into a single ", "-separated string.
    metadata = {
        "tags": ", ".join(str(tag) for tag in (tags or [])),
        "summary": summary,
        "source": source,
    }
    # Add document (Chroma will generate embeddings using the collection's embedding function)
    collection.add(
        ids=[doc_id],
        documents=[text],
        metadatas=[metadata]
    )
53
+
54
def search_documents(collection, query: str, top_n: int = 5) -> list[dict]:
    """
    Search for semantically similar documents in the collection.
    Returns top N results with their metadata.

    Args:
        collection: Collection to query (anything exposing ``query``).
        query: Free-text query.
        top_n: Maximum number of results.

    Returns:
        One dict per hit with keys ``id``, ``score`` (the distance),
        ``source``, ``tags``, ``summary`` and ``document``. Fields absent from
        the response are ``None``; tags stored as a comma-separated string are
        returned as a list.
    """
    results = collection.query(
        query_texts=[query],
        n_results=top_n,
        include=["metadatas", "documents", "distances"]
    )

    def first_batch(key):
        # Each field holds one list per query; tolerate missing/None fields.
        batches = results.get(key) or [[]]
        return batches[0] or []

    ids = first_batch("ids")
    documents = first_batch("documents")
    metadatas = first_batch("metadatas")
    distances = first_batch("distances")

    hits = []
    for i, doc_id in enumerate(ids):
        # A metadata entry can be None when a document was stored without any.
        meta = (metadatas[i] if i < len(metadatas) else None) or {}
        tags = meta.get("tags")
        if isinstance(tags, str):
            tags = [tag.strip() for tag in tags.split(",") if tag.strip()]
        hits.append({
            "id": doc_id,
            "score": distances[i] if i < len(distances) else None,
            "source": meta.get("source"),
            "tags": tags,
            "summary": meta.get("summary"),
            "document": documents[i] if i < len(documents) else None,
        })
    return hits
core/parser.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from unstructured.partition.auto import partition
4
+
5
def parse_document(file_path: str) -> str:
    """
    Extract the text of a document file (PDF, DOCX, TXT, etc.) with Unstructured.

    Returns the non-empty element texts joined by newlines, or an
    "Error parsing document: ..." message if anything raises.
    """
    try:
        pieces = []
        for element in partition(file_path):
            if element.text:
                pieces.append(element.text)
        return "\n".join(pieces)
    except Exception as exc:
        return f"Error parsing document: {exc}"
16
+
17
def parse_url(url: str) -> str:
    """
    Fetch and parse webpage content at the given URL.

    Args:
        url: Address of the page to fetch.

    Returns:
        The text of the page's ``p``, ``h1``-``h3`` and ``li`` elements joined
        by newlines, or an "Error fetching URL: ..." message when the request
        fails, returns an HTTP error status, or parsing raises.
    """
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx responses as failures instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract visible text from paragraphs, headings and list items
        paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'li'])
        return "\n".join(p.get_text() for p in paragraphs)
    except Exception as e:
        return f"Error fetching URL: {e}"
core/processing.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/processing.py
2
+
3
+ import requests
4
+ from unstructured.partition.html import partition_html
5
+ from unstructured.partition.auto import partition
6
+ import config
7
+
8
def fetch_web_content(url: str) -> str:
    """
    Fetch and parse web content from the given URL into structured text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The non-empty element texts joined by blank lines, or an empty string
        when both the direct parse and the fallback download fail.
    """
    def join_elements(elements) -> str:
        return "\n\n".join(
            elem.text for elem in elements if hasattr(elem, 'text') and elem.text
        )

    try:
        # Use Unstructured to fetch and parse HTML content directly from the URL
        return join_elements(partition_html(url=url))
    except Exception:
        # If Unstructured's own fetch fails, download the page ourselves.
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            # The downloaded HTML is a str, so hand it over as text rather
            # than as a file object.
            return join_elements(partition_html(text=response.text))
        except Exception:
            # On failure, return empty string
            return ""
30
+
31
def parse_local_file(file_path: str) -> str:
    """
    Turn a local file (e.g., PDF, DOCX, TXT) into text with Unstructured.

    Returns the non-empty element texts joined by blank lines, or an empty
    string if parsing raises.
    """
    try:
        chunks = []
        for element in partition(filename=file_path):
            if hasattr(element, 'text') and element.text:
                chunks.append(element.text)
        return "\n\n".join(chunks)
    except Exception:
        # Best effort: any failure yields an empty result
        return ""
core/storage.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chromadb
2
+ import os
3
+ from mistralai import Mistral
4
+ import config
5
+
6
# Initialize ChromaDB client (persistent directory can be set via CHROMA_DB_DIR)
# NOTE(review): chroma_db_path is read here but never passed to the client
# below, so the directory setting currently has no effect — confirm intent.
chroma_db_path = os.getenv("CHROMA_DB_DIR", "db/")
client = chromadb.Client()
# Single shared collection used by every function in this module.
collection = client.get_or_create_collection("documents")
10
+
11
+ # Use Mistral API for embeddings
12
+
13
def get_mistral_embedding(text: str) -> list[float]:
    """
    Get embedding for the given text using Mistral API.

    Args:
        text: Text to embed.

    Returns:
        The embedding vector of the single input.
    """
    # A distinct local name avoids confusion with the module-level Chroma client.
    with Mistral(api_key=config.MISTRAL_API_KEY) as mistral_client:
        response = mistral_client.embeddings.create(
            model="mistral-embed",
            inputs=[text],
        )
    # The SDK returns a response object with one entry per input.
    return response.data[0].embedding
24
+
25
+
26
def add_document(doc_id: str, text: str, metadata: dict):
    """
    Add a document's text and metadata to the ChromaDB collection.

    Args:
        doc_id: Unique ID for the document.
        text: Document text; embedded via the Mistral API before storing.
        metadata: Metadata stored alongside the document.

    Returns:
        ``True`` once the document has been added.
    """
    embedding = get_mistral_embedding(text)
    collection.add(ids=[doc_id], embeddings=[embedding], documents=[text], metadatas=[metadata])
    # Older ChromaDB clients need an explicit persist(); current releases
    # removed the method, so only call it when it exists.
    persist = getattr(client, "persist", None)
    if callable(persist):
        persist()
    return True
35
+
36
+
37
def search_documents(query: str, top_k: int = 5) -> dict:
    """
    Search for documents semantically similar to the query.
    Returns a dictionary of top results.

    Args:
        query: Free-text query; embedded via the Mistral API.
        top_k: Maximum number of results.

    Returns:
        The raw result of ``collection.query``.
    """
    query_vec = get_mistral_embedding(query)
    # IDs are always part of the result; 'ids' is not a valid ``include``
    # value and makes the query fail.
    results = collection.query(query_embeddings=[query_vec], n_results=top_k,
                               include=['distances', 'documents', 'metadatas'])
    return results
46
+
47
+
48
def get_all_documents() -> list:
    """
    Retrieve metadata for all documents in the collection.

    Returns:
        A list of ``{"id": ..., "metadata": ...}`` dicts, one per stored
        document.
    """
    # One request returns every ID with its metadata; no per-document lookups.
    res = collection.get(include=["metadatas"])
    ids = res["ids"] or []
    metadatas = res["metadatas"] or []
    return [{"id": doc_id, "metadata": meta} for doc_id, meta in zip(ids, metadatas)]
core/summarizer.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def summarize_content(text: str) -> str:
    """
    Produce a placeholder summary: the first 100 characters of the text.
    (Stub standing in for a Claude 3 Haiku call.)
    """
    # Flatten to one line first, then truncate with an ellipsis if needed.
    flattened = text.strip().replace("\n", " ")
    if len(flattened) > 100:
        excerpt = flattened[:100] + "..."
    else:
        excerpt = flattened
    return "Summary: " + excerpt
10
+
11
def tag_content(text: str) -> list:
    """
    Generate tags for the text. (This is a stub simulating a Mistral 7B call.)

    Each keyword is matched case-insensitively as a whole word, so "AI" is
    found in "AI tools" but not inside "said".

    Args:
        text: Text to scan for keywords.

    Returns:
        The matching keywords in their canonical spelling, or ``["general"]``
        when none match.
    """
    import re

    # In a real app, you might call a tag-generation model or use embeddings.
    # We'll simulate by picking some keywords.
    common_words = ["data", "analysis", "python", "research", "AI"]
    tags = [
        word
        for word in common_words
        if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE)
    ]
    return tags or ["general"]
core/utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/utils.py
2
+
3
+ import re
4
+ from datetime import datetime
5
+ import hashlib
6
+
7
def clean_text(text: str) -> str:
    """
    Normalize whitespace: every run becomes one space, ends are trimmed.
    Falsy input yields an empty string.
    """
    if not text:
        return ""
    # split() with no argument drops leading/trailing whitespace and splits
    # on whitespace runs, so re-joining with single spaces normalizes the text.
    return " ".join(text.split())
16
+
17
def generate_doc_id(source: str) -> str:
    """
    Build a document ID: the MD5 hex digest of "<source>-<current ISO timestamp>".
    """
    stamp = datetime.now().isoformat()
    digest = hashlib.md5()
    digest.update(f"{source}-{stamp}".encode())
    return digest.hexdigest()
data/article_url.txt ADDED
File without changes
data/document1.pdf ADDED
File without changes
data/sample_note.txt ADDED
File without changes
mcp_tools.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # # mcp_tools.py
2
+
3
+ # from fastmcp import FastMCP
4
+ # import core.processing as processing
5
+ # import core.ai_enrichment as ai_enrichment
6
+ # import core.database as db
7
+ # import core.utils as utils
8
+
9
+ # # Initialize the FastMCP server instance
10
+ # mcp = FastMCP(name="IntelligentContentOrganizer")
11
+
12
+ # # Initialize the ChromaDB collection (shared for all tools)
13
+ # collection = db.init_chroma()
14
+
15
+ # @mcp.tool()
16
+ # def process_content(url: str) -> dict:
17
+ # """
18
+ # Process content from a web URL: fetch, enrich, and store.
19
+ # Returns document ID, tags, summary, and source.
20
+ # """
21
+ # content = processing.fetch_web_content(url)
22
+ # text = utils.clean_text(content)
23
+ # tags = ai_enrichment.generate_tags(text) if text else []
24
+ # summary = ai_enrichment.summarize_text(text) if text else ""
25
+ # doc_id = utils.generate_doc_id(url)
26
+ # # Add the document to the database collection
27
+ # db.add_document(collection, doc_id, text, tags, summary, source=url)
28
+ # return {"id": doc_id, "tags": tags, "summary": summary, "source": url}
29
+
30
+ # @mcp.tool()
31
+ # def upload_local_file(file_path: str) -> dict:
32
+ # """
33
+ # Process a local file: parse, enrich, and store.
34
+ # Returns document ID, tags, summary, and source.
35
+ # """
36
+ # content = processing.parse_local_file(file_path)
37
+ # text = utils.clean_text(content)
38
+ # tags = ai_enrichment.generate_tags(text) if text else []
39
+ # summary = ai_enrichment.summarize_text(text) if text else ""
40
+ # doc_id = utils.generate_doc_id(file_path)
41
+ # db.add_document(collection, doc_id, text, tags, summary, source=file_path)
42
+ # return {"id": doc_id, "tags": tags, "summary": summary, "source": file_path}
43
+
44
+ # @mcp.tool()
45
+ # def semantic_search(query: str, top_n: int = 5) -> list:
46
+ # """
47
+ # Search for documents semantically similar to the query.
48
+ # Returns top N results as a list of dictionaries.
49
+ # """
50
+ # results = db.search_documents(collection, query, top_n)
51
+ # return results
52
+
53
+
54
+ from fastmcp import FastMCP
55
+ from core.parser import parse_document, parse_url
56
+ from core.summarizer import summarize_content, tag_content
57
+ from core.storage import add_document, search_documents
58
+ from core.agent import answer_question
59
+ import json
60
+
61
+ mcp = FastMCP("IntelligentContentOrganizer_MCP")
62
+
63
@mcp.tool(name="parse_document")
def mcp_parse_document(file_path: str) -> str:
    """
    MCP tool: extract and return the text of the document at ``file_path``.
    """
    return parse_document(file_path)
70
+
71
@mcp.tool(name="parse_url")
def mcp_parse_url(url: str) -> str:
    """
    MCP tool: fetch the page at ``url`` and return its parsed text.
    """
    return parse_url(url)
78
+
79
@mcp.tool(name="summarize")
def mcp_summarize(text: str) -> str:
    """
    MCP tool: return a summary of ``text``.
    """
    summary = summarize_content(text)
    return summary
85
+
86
@mcp.tool(name="tag")
def mcp_tag(text: str) -> str:
    """
    MCP tool: return the tags for ``text`` encoded as a JSON list.
    """
    return json.dumps(tag_content(text))
93
+
94
@mcp.tool(name="add_to_db")
def mcp_add_to_db(doc_id: str, text: str, metadata_json: str) -> str:
    """
    MCP tool: Add a document to ChromaDB with given ID and metadata (JSON).

    Args:
        doc_id: Unique ID for the document.
        text: Document text to store.
        metadata_json: JSON object holding the document's metadata.

    Returns:
        A confirmation message containing the document ID.

    Raises:
        ValueError: If ``metadata_json`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or does not decode
            to an object.
    """
    metadata = json.loads(metadata_json)
    # Reject lists, strings, numbers etc. here with a clear message.
    if not isinstance(metadata, dict):
        raise ValueError("metadata_json must decode to a JSON object")
    add_document(doc_id, text, metadata)
    return "Document added with ID: " + doc_id
102
+
103
@mcp.tool(name="search_db")
def mcp_search_db(query: str, top_k: int = 5) -> str:
    """
    MCP tool: run a semantic search for ``query`` and return the results as JSON.
    """
    return json.dumps(search_documents(query, top_k=top_k))
110
+
111
@mcp.tool(name="answer_question")
def mcp_answer_question(question: str) -> str:
    """
    MCP tool: answer ``question`` through the agent workflow.
    """
    return answer_question(question)
118
+
119
if __name__ == "__main__":
    # Run the MCP server over the streamable HTTP transport (for web
    # integration), on all interfaces, port 7861, at path /mcp.
    mcp.run(transport="streamable-http", host="0.0.0.0", port=7861, path="/mcp")
122
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ mistralai
3
+ python-dotenv
4
+ gradio>=4.0
5
+ fastmcp>=2.0
6
+ chromadb
7
+ sentence-transformers
8
+ unstructured
9
+ requests
10
+ beautifulsoup4
11
+ plotly
12
+ networkx