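"""Resume Q&A API.

Chunks section-delimited .txt files, embeds each section with
sentence-transformers, indexes the embeddings in FAISS, and answers
questions over the retrieved context with an extractive QA model.
"""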
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import os
print("π Files in /app:", os.listdir())
# === Load and chunk structured .txt files ==== test
def load_structured_chunks(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    print("Raw resume text:\n", text)

    # Split on '== Section ==' headers
    raw_chunks = text.split("== ")
    print("Raw chunks:\n", raw_chunks)

    chunks = []
    for chunk in raw_chunks:
        cleaned = chunk.strip()
        if cleaned:
            # Prepend the section title so each chunk keeps its context
            lines = cleaned.splitlines()
            section_title = lines[0] if lines else "Untitled"
            section_body = "\n".join(lines[1:]).strip()
            if section_body:
                chunks.append(f"{section_title}\n{section_body}")
    return chunks
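# Example of the section-delimited format this parser assumes
# (hypothetical resume.txt contents):
#
#   == Summary ==
#   Backend engineer focused on Python APIs.
#
#   == Skills ==
#   Python, FastAPI, Docker
#
# Splitting on "== " yields chunks such as
# "Summary ==\nBackend engineer focused on Python APIs.", i.e. each chunk
# keeps its section title (with a trailing "==") on its first line.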
# === Load resume and extra info ===
resume_chunks = load_structured_chunks("resume.txt")
extra_chunks = load_structured_chunks("extra_info.txt") if os.path.exists("extra_info.txt") else []
all_chunks = resume_chunks + extra_chunks
print("Resume chunks:", resume_chunks)
print("Extra info chunks:", extra_chunks)
print("Total chunks:", len(all_chunks))
# === Embed chunks ===
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(all_chunks)
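# Note: encode() returns a float32 numpy array of shape
# (len(all_chunks), 384); all-MiniLM-L6-v2 embeddings are 384-dimensional,
# which is the dimension handed to FAISS below.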
# === Create FAISS index ===
dimension = embeddings[0].shape[0]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
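# IndexFlatL2 performs exact (brute-force) L2 search; for a handful of
# resume sections this is fast, and no approximate index is needed.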
# === Retrieval function ===
def retrieve_context(query, top_k=3):
    print("Total chunks:", len(all_chunks))
    print("FAISS index size:", index.ntotal)
    query_embedding = embedder.encode([query])
    scores, indices = index.search(query_embedding, top_k)

    selected_chunks = []
    for i, score in zip(indices[0], scores[0]):
        chunk = all_chunks[i]
        print("Retrieved chunk:\n", chunk)
        # Skip short or noisy chunks unless the query hits a keyword whose
        # answer lives in a short section (salary, notice period, etc.)
        if len(chunk.split()) < 10 and not any(k in query.lower() for k in ["salary", "notice", "job", "current"]):
            continue
        selected_chunks.append((chunk, score))

    # If nothing survives filtering, fall back to the original top_k
    if not selected_chunks:
        selected_chunks = [(all_chunks[i], scores[0][j]) for j, i in enumerate(indices[0])]

    # Sort by score (lowest L2 distance = best match)
    selected_chunks.sort(key=lambda x: x[1])
    final_chunks = [chunk for chunk, _ in selected_chunks[:top_k]]
    print("Final retrieved chunks:\n", final_chunks)
    return "\n\n".join(final_chunks)
# === Load QA model ===
qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")
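# tinyroberta-squad2 is extractive: the answer is a span copied verbatim
# from the retrieved context, not generated text, so retrieval quality
# directly bounds answer quality.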
# === FastAPI setup ===
app = FastAPI()
class Question(BaseModel):
    query: str
@app.post("/predict")
async def predict(question: Question):
    context = retrieve_context(question.query)
    print("Retrieved context:\n", context)
    result = qa_pipeline(
        question=question.query,
        context=context,
    )
    return {"answer": result["answer"]}
@app.get("/")
def health_check():
    return {"status": "Resume Q&A API is running"}