from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

print("Files in /app:", os.listdir())
# === Load and chunk structured .txt files ===
def load_structured_chunks(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    print("Raw resume text:\n", text)

    # Split on '== Section ==' headers
    raw_chunks = text.split("== ")
    print("Raw chunks:\n", raw_chunks)

    chunks = []
    for chunk in raw_chunks:
        cleaned = chunk.strip()
        if cleaned:
            # Prepend the section title so each chunk keeps its context;
            # strip the trailing '==' left over from the header split
            lines = cleaned.splitlines()
            section_title = lines[0].rstrip("= ").strip() if lines else "Untitled"
            section_body = "\n".join(lines[1:]).strip()
            if section_body:
                chunks.append(f"{section_title}\n{section_body}")
    return chunks
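# Illustration (the section names below are hypothetical, not taken from the
# real resume.txt): a file containing
#     == Experience ==
#     Senior engineer at Acme.
#     == Skills ==
#     Python, FastAPI.
# is chunked into
#     ["Experience\nSenior engineer at Acme.", "Skills\nPython, FastAPI."]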
# Load resume and extra info
resume_chunks = load_structured_chunks("resume.txt")
extra_chunks = load_structured_chunks("extra_info.txt") if "extra_info.txt" in os.listdir() else []
all_chunks = resume_chunks + extra_chunks
print("Resume chunks:", resume_chunks)
print("Extra info chunks:", extra_chunks)
print("Total chunks:", len(all_chunks))
# === Embed chunks ===
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = np.asarray(embedder.encode(all_chunks), dtype="float32")  # FAISS expects float32

# === Create FAISS index ===
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
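# Sanity-check sketch (illustrative only, safe to delete): IndexFlatL2 does an
# exact search, so querying with a chunk's own embedding should return that
# chunk at (near-)zero distance:
#     _d, _i = index.search(embeddings[:1], 1)
#     assert _i[0][0] == 0 and _d[0][0] < 1e-4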
# === Retrieval function ===
def retrieve_context(query, top_k=3):
    print("Total chunks:", len(all_chunks))
    print("FAISS index size:", index.ntotal)

    query_embedding = np.asarray(embedder.encode([query]), dtype="float32")
    scores, indices = index.search(query_embedding, top_k)

    selected_chunks = []
    for i, score in zip(indices[0], scores[0]):
        chunk = all_chunks[i]
        print("Candidate chunk:\n", chunk)
        # Skip short or noisy chunks unless the query matches a known keyword
        if len(chunk.split()) < 10 and not any(k in query.lower() for k in ["salary", "notice", "job", "current"]):
            continue
        selected_chunks.append((chunk, score))

    # If nothing survives filtering, fall back to the original top_k
    if not selected_chunks:
        selected_chunks = [(all_chunks[i], scores[0][j]) for j, i in enumerate(indices[0])]

    # Sort by score (lowest L2 distance = best match)
    selected_chunks.sort(key=lambda x: x[1])
    final_chunks = [chunk for chunk, _ in selected_chunks[:top_k]]
    print("Final retrieved chunks:\n", final_chunks)
    return "\n\n".join(final_chunks)
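# Usage sketch (the query string is hypothetical):
#     retrieve_context("What is your notice period?")
# returns up to top_k sections joined by blank lines, ready to be passed as the
# context of the extractive QA model below.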
# === Load QA model ===
qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")

# === FastAPI setup ===
app = FastAPI()

class Question(BaseModel):
    query: str

@app.post("/predict")
async def predict(question: Question):
    context = retrieve_context(question.query)
    print("Retrieved context:\n", context)
    result = qa_pipeline(
        question=question.query,
        context=context,
    )
    return {"answer": result["answer"]}

@app.get("/")
def health_check():
    return {"status": "Resume Q&A API is running"}
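# Run-and-query sketch (module name "app" and port 7860 are assumptions based
# on the usual Hugging Face Spaces setup; adjust to your environment):
#     uvicorn app:app --host 0.0.0.0 --port 7860
#     curl -X POST http://localhost:7860/predict \
#          -H "Content-Type: application/json" \
#          -d '{"query": "What is your current role?"}'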