import re

import fitz  # PyMuPDF, used to read PDF files
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the English NLP model and the SentenceTransformer model
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")
def extract_text(file):
    """Return plain text from an uploaded PDF or UTF-8 text file."""
    if file.name.endswith(".pdf"):
        doc = fitz.open(stream=file.read(), filetype="pdf")
        return "\n".join(page.get_text() for page in doc)
    else:
        return file.read().decode("utf-8")
def anonymize_text(text):
    doc = nlp(text)

    # Collect spaCy-detected entities as (start, end, label) spans
    replacements = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            replacements.append((ent.start_char, ent.end_char, "PERSON"))
        elif ent.label_ == "DATE":
            replacements.append((ent.start_char, ent.end_char, "DATE"))
        elif ent.label_ in ["GPE", "LOC"]:
            replacements.append((ent.start_char, ent.end_char, "LOCATION"))
        elif ent.label_ == "ORG":
            replacements.append((ent.start_char, ent.end_char, "ORG"))

    # Add regex-based matches for things spaCy misses
    regex_patterns = [
        (r"\b[\w\.-]+@[\w\.-]+\.\w+\b", "EMAIL"),  # Email addresses
        (r"https?://\S+|www\.\S+", "URL"),         # URLs
        (r"\b\d{10}\b", "PHONE"),                  # 10-digit phone numbers
        (r"\b[A-Z]{2,}\d{6,}\b", "ID"),            # Generic IDs (e.g., AA123456)
    ]
    for pattern, label in regex_patterns:
        for match in re.finditer(pattern, text):
            replacements.append((match.start(), match.end(), label))

    # Apply replacements from the end of the text so earlier offsets stay valid,
    # skipping any span that overlaps one that was already replaced
    replacements.sort(key=lambda span: span[0], reverse=True)
    last_start = len(text) + 1
    for start, end, replacement in replacements:
        if end > last_start:
            continue
        text = text[:start] + f"[{replacement}]" + text[end:]  # brackets mark redacted spans
        last_start = start
    return text
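# Illustrative only: a quick sketch of what anonymize_text produces. The sample
# sentence below is invented for this example (it is not part of the original app),
# and the exact spans tagged depend on the spaCy model's predictions.
#
#   anonymize_text("Jane Doe emailed jane.doe@example.com on 5 March 2024 from London.")
#   -> "[PERSON] emailed [EMAIL] on [DATE] from [LOCATION]."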
def score_synopsis(article, synopsis):
    # Content coverage: semantic similarity between article and synopsis (0-50 points)
    embeddings = model.encode([article, synopsis])
    similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    content_coverage = similarity * 50

    # Clarity: ratio of unique words to total words (0-25 points)
    words = synopsis.split()
    clarity = (len(set(words)) / max(len(words), 1)) * 25

    # Coherence: 5 points per sentence, capped at 25
    coherence = min(25, 5 * (len(synopsis.split(".")) - 1))

    total = content_coverage + clarity + coherence
    return {
        "total": round(total, 2),
        "content_coverage": round(content_coverage, 2),
        "clarity": round(clarity, 2),
        "coherence": round(coherence, 2),
    }
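
# A minimal end-to-end sketch of how these functions could be wired together.
# This is an assumption about intended usage rather than part of the original
# Space: the file path and the synopsis string below are hypothetical, and
# extract_text only needs an object with .name and .read(), so a file opened
# in binary mode works in place of an uploaded file.
if __name__ == "__main__":
    with open("article.pdf", "rb") as f:  # hypothetical input file
        article_text = extract_text(f)

    article_text = anonymize_text(article_text)  # redact PII before scoring

    synopsis_text = "A short human-written summary of the article."  # placeholder
    scores = score_synopsis(article_text, synopsis_text)
    print(scores)  # {'total': ..., 'content_coverage': ..., 'clarity': ..., 'coherence': ...}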