"""Streamlit fake-news detector: imports, API configuration and prompt templates.

The app combines a SerpAPI news search, local embedding / NLI models and a
Gemini ("Advanced Model") assessment into one weighted credibility score.
"""
import json
import os
import re
import time
from urllib.parse import urlparse

import numpy as np
import requests
import streamlit as st
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
from serpapi import GoogleSearch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# --- Custom CSS for Styling ---
def load_custom_css():
    """Emit the app's custom CSS block through ``st.markdown``.

    NOTE(review): the stylesheet body is empty here and no call to this
    function is visible in this file — confirm whether both were lost.
    """
    st.markdown(""" """, unsafe_allow_html=True)


# --- API Key Configuration ---
# Both keys are read from the environment; the code treats a missing key as
# "service not configured" rather than as a fatal error.
SERPAPI_KEY = os.environ.get("SERPAPI_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
# The model id can be overridden with GEMINI_MODEL (preview ids get retired);
# when the variable is unset the URL is identical to the previously hard-coded one.
GEMINI_MODEL = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash-preview-09-2025")
GEMINI_API_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    f"{GEMINI_MODEL}:generateContent"
)

# --- SYSTEM PROMPT TEMPLATES ---
# The templates are concatenated in this order, so each one starts and ends
# with a newline to keep the bullet items on separate lines.
BASE_SYSTEM_PROMPT = """
You are a highly intelligent fact-checking AI. Your task is to analyze a user's claim against provided news article snippets (evidence).
Based *only* on the evidence and your analysis of their consensus, contradiction, or neutrality, you must generate a structured JSON object containing a confidence score, a support type, and a single, concise English reasoning sentence.
- Score range is -1.0 (Definitely Contradicted) to +1.0 (Fully Entailed).
"""

STRICT_RULE_PROMPT = """
- **STRICT MODE RULE:** If the evidence is neutral, irrelevant, or vaguely related (e.g., mentioning similar words but not the event), the score must be close to 0.0 or slightly negative. Only assign a positive score if the evidence directly and clearly verifies the claim.
"""

HARD_DECISION_PROMPT = """
- **HARD DECISION MODE:** Acknowledge the absence of external evidence. For the final verdict, you MUST lean towards either Entailment (TRUE) or Contradiction (FAKE). Only use Neutral if the claim is highly subjective or unprovable. For claims that are widely known facts (e.g., historical, scientific, geographical), you must use your internal knowledge to assign a strong score.
"""

# ---------------- CACHE / MODEL LOADERS ----------------
# (Cache functions remain the same)
@st.cache_resource
def load_embedder():
    """Return the 'all-MiniLM-L6-v2' sentence embedder on CPU (st.cache_resource-wrapped)."""
    return SentenceTransformer('all-MiniLM-L6-v2', device='cpu')


@st.cache_resource
def load_nli_model():
    """Return ``(tokenizer, model)`` for 'roberta-large-mnli', moved to CPU."""
    tok = AutoTokenizer.from_pretrained("roberta-large-mnli")
    mdl = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
    mdl.to("cpu")
    return tok, mdl


try:
    embedder = load_embedder()
    nli_tok, nli_model = load_nli_model()
    MODELS_LOADED = True
except Exception:
    # Top-level boundary: the app keeps running without the local models.
    # Bind the names so later references fail predictably instead of with NameError.
    embedder = nli_tok = nli_model = None
    MODELS_LOADED = False


# ---------------- Advanced Model Integration Function ----------------
def get_system_prompt(strict_mode, hard_decision):
    """Assemble the system prompt from the base template plus optional rules.

    Args:
        strict_mode: append STRICT_RULE_PROMPT when truthy.
        hard_decision: append HARD_DECISION_PROMPT when truthy.

    Returns:
        The concatenated prompt string.
    """
    parts = [BASE_SYSTEM_PROMPT]
    if strict_mode:
        parts.append(STRICT_RULE_PROMPT)
    # If NO evidence found, and we want a hard decision, we add the hard rule
    if hard_decision:
        parts.append(HARD_DECISION_PROMPT)
    return "".join(parts)


def _mock_model_result(claim, no_evidence):
    """Return the canned result used when GEMINI_API_KEY is not configured."""
    # NOTE(review): the mock keys off the substrings "modi" and "pm", so any
    # claim containing both (even a negated one) is reported as entailed.
    lowered = claim.lower()
    is_demo_claim = "modi" in lowered and "pm" in lowered
    if no_evidence:
        confidence = 0.9 if is_demo_claim else 0.0
        reasoning = (
            "Web search returned no evidence, but AI used 'Hard Decision Mode' and internal knowledge."
            if confidence != 0.0
            else "Web search returned no evidence. Model cannot confirm or deny without external data."
        )
        return {
            "confidence": confidence,
            "type": "Entailment" if confidence > 0.5 else "Neutral",
            "reasoning": reasoning,
        }
    # Normal flow mock
    if is_demo_claim:
        return {
            "confidence": 0.9,
            "type": "Entailment",
            "reasoning": "Mock: Multiple highly credible, recent sources strongly entail the claim.",
        }
    return {
        "confidence": 0.0,
        "type": "Neutral",
        "reasoning": "Advanced Model API key is missing. Skipping analysis.",
    }


def _build_user_prompt(claim, analyzed_articles, no_evidence):
    """Build the user-turn prompt: the claim plus evidence snippets, if any."""
    if no_evidence:
        return (
            "Analyze the following claim. **CRITICAL: NO WEB EVIDENCE WAS FOUND for this claim.** "
            "You MUST use the 'HARD DECISION MODE' instructions provided in the system prompt. Do not use external evidence, rely on your internal knowledge.\n\n"
            f"**CLAIM:** {claim}\n\n"
            f"**EVIDENCE SNIPPETS (0 Found):** None"
        )
    evidence_list = [
        f"--- Source {idx+1} ({domain_from_url(article.get('link',''))}) ---\n"
        f"Snippet: {article.get('snippet','')}\n"
        f"NLI Scores (E/N/C): {article.get('entail_p',0.0):.2f}/{article.get('neutral_p',0.0):.2f}/{article.get('contra_p',0.0):.2f}\n"
        for idx, article in enumerate(analyzed_articles)
    ]
    return (
        "Analyze the following claim against the provided search evidence. "
        "Your decision must be based on the consensus of the evidence. **Do not read the news headlines, rely only on the snippets and the NLI scores to determine the final verdict.**\n\n"
        f"**CLAIM:** {claim}\n\n"
        f"**EVIDENCE SNIPPETS (Top {len(analyzed_articles)}):**\n" + "\n".join(evidence_list)
    )


def call_advanced_model_for_credibility(claim, analyzed_articles, no_evidence=False, strict_mode=False):
    """Ask the Gemini endpoint for a structured credibility verdict on ``claim``.

    Args:
        claim: the user's claim text.
        analyzed_articles: list of dicts; the keys read here are ``link``,
            ``snippet``, ``entail_p``, ``neutral_p`` and ``contra_p`` (all optional).
        no_evidence: True when the web search produced nothing; this also
            switches on the hard-decision system prompt.
        strict_mode: include the strict-evidence rule in the system prompt.

    Returns:
        A dict with ``confidence`` (float in [-1.0, 1.0]), ``type`` and
        ``reasoning``. ``type`` is "Error" when every attempt failed. Without
        an API key a canned mock result is returned instead.
    """
    # Hard decision only if no evidence found
    system_prompt = get_system_prompt(strict_mode, hard_decision=no_evidence)

    if not GEMINI_API_KEY:
        # Mock result simulation for visualization
        return _mock_model_result(claim, no_evidence)

    prompt = _build_user_prompt(claim, analyzed_articles, no_evidence)

    response_schema = {
        "type": "OBJECT",
        "properties": {
            "verdict_confidence": {"type": "NUMBER", "description": "A score from -1.0 (Contradicted) to +1.0 (Entailed)."},
            "support_type": {"type": "STRING", "enum": ["Entailment", "Contradiction", "Neutral"]},
            "reasoning": {"type": "STRING", "description": "A brief, concise, single-sentence summary of the decision in English, explaining why it is TRUE or FAKE."},
        },
        "required": ["verdict_confidence", "support_type", "reasoning"],
    }
    payload = {
        "contents": [{"parts": [{"text": prompt}]}],
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "generationConfig": {
            "responseMimeType": "application/json",
            "responseSchema": response_schema,
        },
    }

    # Transport failures and malformed responses are both retried; anything
    # outside this tuple is a programming error and is allowed to surface.
    recoverable = (requests.RequestException, KeyError, IndexError,
                   TypeError, ValueError, AttributeError)
    max_retries = 3
    delay = 1
    for attempt in range(max_retries):
        try:
            response = requests.post(
                GEMINI_API_URL,
                # The key travels in a header so it cannot leak through URLs
                # embedded in logs or exception messages.
                headers={
                    "Content-Type": "application/json",
                    "x-goog-api-key": GEMINI_API_KEY,
                },
                json=payload,
                timeout=15,
            )
            response.raise_for_status()
            result_json_str = response.json()['candidates'][0]['content']['parts'][0]['text']
            model_result = json.loads(result_json_str)
            raw_confidence = float(model_result.get('verdict_confidence', 0.0))
            # nan_to_num maps NaN to 0.0 so a bad value cannot poison the
            # weighted score; the result is a plain float clamped to [-1, 1].
            confidence = float(np.clip(np.nan_to_num(raw_confidence), -1.0, 1.0))
            return {
                "confidence": confidence,
                "type": model_result.get('support_type', 'Neutral'),
                "reasoning": model_result.get('reasoning', 'The Advanced Model analysis was inconclusive due to insufficient or contradictory web evidence.'),
            }
        except recoverable:
            if attempt < max_retries - 1:
                time.sleep(delay)
                delay *= 2  # exponential backoff between attempts
    return {"confidence": 0.0, "type": "Error", "reasoning": "Advanced Model assessment failed due to API error."}


# ---------------- Utilities ----------------
def domain_from_url(url):
    """Return the host part of ``url`` without a leading ``www.``.

    The input is returned unchanged when it cannot be parsed as a text URL.
    """
    try:
        netloc = urlparse(url).netloc
    except (ValueError, TypeError, AttributeError):
        return url
    if not isinstance(netloc, str):
        # bytes / None input: there is no text host to report
        return url
    # Strip only a leading "www."; str.replace would also remove it mid-host.
    return netloc[4:] if netloc.startswith("www.") else netloc


def pretty_pct(x):
    """Format a 0..1 fraction as a whole-number percentage string."""
    # round() rather than int(): 0.29 * 100 is 28.999..., which int() shows as 28%.
    return f"{int(round(float(x) * 100))}%"


def _escape_html(text):
    """Escape ``text`` for interpolation into ``unsafe_allow_html`` markdown."""
    import html  # local import keeps this block self-contained

    return html.escape(str(text))


# --- NEW CLEANING FUNCTION (to fix the zombie ant problem) ---
def clean_claim_for_search(claim):
    """Reduce a raw claim to a short search query.

    Strips wrapping double quotes, drops characters other than letters,
    digits, whitespace and ``. , ? !``, collapses whitespace, keeps the first
    sentence and limits the result to 150 characters.
    """
    import unicodedata  # local import keeps this block self-contained

    cleaned = claim.strip()
    if len(cleaned) >= 2 and cleaned.startswith('"') and cleaned.endswith('"'):
        cleaned = cleaned[1:-1]

    def _keep(ch):
        # Letters and digits of any script, plus combining marks (category M*),
        # so non-ASCII claims are no longer erased or mangled.
        return (
            ch.isalnum()
            or ch.isspace()
            or ch in ".,?!"
            or unicodedata.category(ch).startswith("M")
        )

    # Remove excessive punctuation that might confuse the search engine but keep basic sentence structure
    cleaned = "".join(ch for ch in cleaned if _keep(ch))
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Take the first complete sentence/idea for a focused search. A period only
    # ends the sentence when followed by whitespace or the end of the text, so
    # decimals such as "3.5" are not cut in half.
    first_sentence = re.match(r'.*?\.(?=\s|$)', cleaned)
    if first_sentence:
        cleaned = first_sentence.group(0)
    return cleaned[:150]  # Limit length


# ... (NLI, best_sentence, domain_boost, and analyze_top_articles remain the same) ...
# (We assume analyze_top_articles is the fixed version from the previous response)
# NOTE(review): analyze_top_articles is called below but is not defined in the
# visible source; it must be supplied before this script can run end to end.


def _placeholder_metrics():
    """Return neutral NLI metrics for runs where no NLI analysis took place."""
    return {
        "avg_ent": 0.0, "avg_con": 0.0, "avg_neutral": 1.0, "avg_sim": 0.0,
        "avg_cred": 0.0, "net_support": 0.0, "support_score": 0.0,
    }


def _fetch_news_results(query, num_results):
    """Run the SerpAPI news search.

    Returns:
        ``(results, error)``: ``results`` is a possibly empty list of result
        dicts and ``error`` is a short message or None.
    """
    if not SERPAPI_KEY:
        return [], "SERPAPI_KEY is not set."
    params = {"engine": "google", "q": query, "tbm": "nws", "tbs": "qdr:d1",
              "num": num_results, "api_key": SERPAPI_KEY}
    try:
        data = GoogleSearch(params).get_dict()
    except Exception as exc:
        # Report only the exception type: the message may embed the request
        # URL, which carries the API key.
        return [], f"search request raised {type(exc).__name__}"
    results = data.get("news_results") or data.get("organic_results") or []
    return results, data.get("error")


def _normalize_result(r):
    """Map one raw search result onto the ``title`` / ``snippet`` / ``link`` shape."""
    source = r.get("source")
    # "source" may be a plain string rather than a dict, so guard the lookup.
    source_url = source.get("url") if isinstance(source, dict) else None
    return {
        "title": r.get("title") or r.get("title_raw") or r.get("title_original") or "",
        "snippet": r.get("snippet") or r.get("snippet_highlighted") or r.get("excerpt") or "",
        "link": r.get("link") or source_url or r.get("source_link") or "",
    }


def run_verification(claim, *, num_results, top_k, true_threshold, strict_mode, full_power_mode):
    """Verify ``claim`` and render the verdict, summary and details.

    Args:
        claim: raw text from the input box.
        num_results: number of web results to request.
        top_k: number of articles passed to the NLI analysis.
        true_threshold: score at or above which the verdict is TRUE (and at or
            below its negative, FAKE).
        strict_mode: forwarded to the Advanced Model call.
        full_power_mode: only affects the mode label in the summary heading.
    """
    if not claim.strip():
        st.warning("Please enter a claim to verify.")
        return  # nothing to search for; skip the search and model calls

    processed_claim = clean_claim_for_search(claim)
    if processed_claim != claim.strip():
        st.info(f"✨ **Pre-processing:** Claim cleaned for better search results. (Query: '{processed_claim}')")

    # --- Verification Process ---
    status_placeholder = st.empty()

    def update_step(active_step, fade_steps=()):
        """Redraw the step indicator inside ``status_placeholder``."""
        steps = ["🌐 Web Search", "🧠 NLI Analysis", "🤖 AI Assessment"]
        step_html = "\n"
        for i, step in enumerate(steps):
            # NOTE(review): step_class is not referenced by the string below;
            # presumably the markup that consumed it is missing — TODO confirm.
            step_class = 'active' if i == active_step else ('faded' if i in fade_steps else '')
            step_html += f"{step}"
        step_html += "\n"
        status_placeholder.markdown(step_html, unsafe_allow_html=True)

    # 1) SerpAPI fetch
    update_step(0)
    time.sleep(0.5)
    results, search_error = _fetch_news_results(processed_claim, num_results)

    if not results:
        # --- SCENARIO 1: NO WEB RESULTS (RUN AI HARD DECISION) ---
        update_step(-1, fade_steps=[0, 1])
        if search_error:
            # Distinguish a failed search from a search that found nothing.
            st.error(f"Web search failed: {search_error}")
        st.warning("⚠️ Web Search returned 0 results. Proceeding to AI Hard Assessment based on lack of external evidence.")

        # Placeholder/Zero metrics for NLI
        metrics = _placeholder_metrics()
        analyzed = []  # No articles to analyze

        # 3) Advanced Model Analysis: Running with NO EVIDENCE flag
        update_step(2, fade_steps=[0, 1])
        time.sleep(0.5)
        # NOTE(review): no_evidence=True always enables the hard-decision prompt,
        # regardless of the "Full Power Mode" checkbox — confirm this is intended.
        model_score = call_advanced_model_for_credibility(claim, analyzed, no_evidence=True, strict_mode=strict_mode)

        # WCS is dominated by AI score (since NLI is 0)
        weighted_credibility_score = model_score['confidence']
    else:
        # --- SCENARIO 2: RESULTS FOUND (Normal Flow) ---
        normalized = [_normalize_result(r) for r in results]

        # 2) NLI/Semantic Analysis
        update_step(1)
        time.sleep(0.5)
        if MODELS_LOADED:
            metrics, analyzed = analyze_top_articles(normalized, claim, top_k=top_k)
        else:
            # Presumably analyze_top_articles needs the local models; without
            # them, pass the raw snippets on with neutral NLI metrics.
            st.warning("Local NLI models are unavailable; using the Advanced Model assessment only.")
            metrics = _placeholder_metrics()
            analyzed = normalized[:top_k]

        # 3) Advanced Model Analysis
        update_step(2)
        time.sleep(0.5)
        model_score = call_advanced_model_for_credibility(claim, analyzed, no_evidence=False, strict_mode=strict_mode)

        # 4) Combine Scores for Final Weighted Credibility Score (WCS)
        WEIGHT_NLI = 0.20
        WEIGHT_ADVANCED_MODEL = 0.80
        nli_normalized_score = np.clip(metrics['support_score'], -1.0, 1.0)
        weighted_credibility_score = (WEIGHT_NLI * nli_normalized_score) + (WEIGHT_ADVANCED_MODEL * model_score['confidence'])

    status_placeholder.empty()  # Clear the final step indicator

    # --- FINAL DYNAMIC VERDICT DISPLAY ---
    # NOTE(review): verdict_class, rationale_color and pointer_left (below) are
    # not referenced by any string in this file; presumably the markup that
    # consumed them is missing — TODO confirm.
    if weighted_credibility_score >= true_threshold:
        verdict_class = "verdict-true"
        verdict_text = "✅ TRUE"
        rationale_color = '#00ff88'
    elif weighted_credibility_score <= -true_threshold:  # Use the same threshold for FAKE
        verdict_class = "verdict-fake"
        verdict_text = "🚨 FAKE"
        rationale_color = '#ff0044'
    else:
        verdict_class = "verdict-neutral"
        verdict_text = "❓ INCONCLUSIVE"
        rationale_color = '#ffff00'

    # 1. Big Verdict Box
    st.markdown(f"\n\n{verdict_text}\n\n", unsafe_allow_html=True)

    # 2. Key Summary Section
    st.markdown("\n", unsafe_allow_html=True)
    st.markdown(f"### 💡 Key Analysis Summary (Mode: {'FULL POWER' if full_power_mode and not results else 'STANDARD'})")
    col_s1, col_s2, col_s3 = st.columns(3)
    with col_s1:
        st.markdown(f"**Final Score:** `{weighted_credibility_score:.3f}`")
    with col_s2:
        st.markdown(f"**Source Consensus:** `{model_score['type']}`")
    with col_s3:
        st.markdown(f"**Web Support:** `{'N/A' if not results else pretty_pct(metrics['avg_ent'])}`")
    # The reasoning is model output shaped by web snippets: escape it before
    # rendering with unsafe_allow_html=True.
    st.markdown(f"\n\n**Model Rationale:** {_escape_html(model_score['reasoning'])}\n\n", unsafe_allow_html=True)
    st.markdown("\n", unsafe_allow_html=True)
    st.markdown("---")

    # 3. Weighted Credibility Score Meter
    st.markdown("\n\nFinal Weighted Credibility Score\n\n", unsafe_allow_html=True)
    meter_col1, meter_col2, meter_col3 = st.columns([1, 4, 1])
    with meter_col2:
        st.markdown(f"\n\n{weighted_credibility_score:.3f}\n\n", unsafe_allow_html=True)
        pointer_left = (weighted_credibility_score + 1.0) / 2.0 * 100
        st.markdown(
            f"""
-1.0 (FAKE) 0.0 (NEUTRAL) +1.0 (TRUE)
""",
            unsafe_allow_html=True
        )
    st.markdown("---")

    # 4. Detailed Metrics in Expander with 3-Column Card Layout
    with st.expander("📊 Detailed Analysis Metrics"):
        if results:
            st.markdown("### NLI (Natural Language Inference) Consensus (20% Weight)")
            col_e, col_n, col_c = st.columns(3)
            with col_e:
                st.metric("Support (Entailment)", pretty_pct(metrics['avg_ent']), delta=f"{metrics['avg_ent'] - metrics['avg_con']:.2f} Net", delta_color="normal")
            with col_n:
                st.metric("Neutral (Irrelevant)", pretty_pct(metrics['avg_neutral']))
            with col_c:
                st.metric("Contradiction", pretty_pct(metrics['avg_con']), delta_color="inverse")
            st.markdown("---")
        else:
            st.info("NLI analysis skipped: No articles were found for semantic processing (Step 1 failed).")
            st.markdown("---")
        st.markdown("### Advanced Model Assessment (80% Weight)")
        st.write(f"**Model Confidence Score:** **{model_score['confidence']:.3f}** ({model_score['type']})")
        st.write(f"**Model Reasoning:** *{model_score['reasoning']}*")

    # 5. Analyzed Sources Expander
    with st.expander(f"🔎 Analyzed Web Sources (Top {top_k} Articles)"):
        if results:
            for idx, r in enumerate(analyzed):
                heading = r.get('title') or domain_from_url(r.get('link', '')) or '(no title)'
                st.markdown(f"**{idx+1}. {heading}**")
                st.caption(f"🔗 {domain_from_url(r.get('link',''))} | Credibility Boost: {r.get('cred',0.0):.2f}")
                net_support_val = (r.get('entail_p',0.0) - r.get('contra_p',0.0))
                st.markdown(f"**Net Support Score:** `{net_support_val:.2f}`")
                progress_val_source = (net_support_val + 1.0) / 2.0
                # Clamp to [0, 1] and cast to a plain float; the probabilities
                # may arrive as NumPy scalars or drift slightly out of range.
                st.progress(float(min(max(progress_val_source, 0.0), 1.0)))
                st.markdown(f"*(E: {pretty_pct(r.get('entail_p',0.0))} | N: {pretty_pct(r.get('neutral_p',0.0))} | C: {pretty_pct(r.get('contra_p',0.0))})*")
                st.markdown(f"**Snippet (Most Relevant Sentence):** *{r.get('best_sent') or r.get('snippet')}*")
                st.markdown("---")
        else:
            st.markdown("No web search results were found to analyze.")


# ---------------- UI Layout and Main Execution ----------------

# Defaults for every setting. Only the selected settings group renders its
# widgets on a given run, so the other groups fall back to these values.
# NOTE(review): values chosen in one group are therefore not retained after
# switching to another group — confirm whether they should persist.
NUM_RESULTS = 10
TOP_K_FOR_VERDICT = 3
TRUE_THRESHOLD = 0.35
STRICT_MODE = True
FULL_POWER_MODE = False

# --- SIDEBAR (NEW CONFIGURATION TABS) ---
st.sidebar.markdown("\n\n⚡ Detector Control Panel\n\n", unsafe_allow_html=True)
config_tab = st.sidebar.radio("Settings Group", ["⚙️ Core Config", "⚡ Strength Config", "📜 History / Context"])

# --- 1. CORE CONFIG ---
if config_tab == "⚙️ Core Config":
    st.sidebar.markdown("### 🔍 Search Parameters")
    NUM_RESULTS = st.sidebar.slider("Search Depth (Web Results)", 5, 20, 10, 5)
    TOP_K_FOR_VERDICT = st.sidebar.slider("Verdict Sources (Articles Analyzed)", 1, 5, 3)
    TRUE_THRESHOLD = st.sidebar.slider("TRUE Threshold Score (> X)", 0.1, 0.7, 0.35, 0.05)
    st.sidebar.markdown("---")

# --- 2. STRENGTH CONFIG ---
elif config_tab == "⚡ Strength Config":
    st.sidebar.markdown("### 🤖 AI Assessment Rigor")
    STRICT_MODE = st.sidebar.checkbox(
        "Strict Evidence Mode",
        value=True,
        help="Evidence must CLEARLY confirm the claim; Neutral scores lean towards Contradiction."
    )
    FULL_POWER_MODE = st.sidebar.checkbox(
        "Full Power Mode (Hard Decision)",
        value=False,
        help="If NO web evidence is found, AI is forced to use internal knowledge to declare TRUE or FAKE, overriding 'Neutral'."
    )
    # If the user activates FULL POWER MODE, adjust the threshold for certainty
    if FULL_POWER_MODE:
        st.sidebar.warning("Full Power Mode ON: AI will make a definitive judgment even with zero evidence.")

# --- 3. HISTORY / CONTEXT ---
elif config_tab == "📜 History / Context":
    st.sidebar.markdown("### 📚 Analysis History (Future Feature)")
    st.sidebar.info("This section will store and manage past fact-checks.")

# --- API Status Indicators (Always visible) ---
st.sidebar.markdown("---")
st.sidebar.markdown("### 🔑 API Status")
st.sidebar.markdown(f"- **SerpAPI:** **{'✅ Connected' if SERPAPI_KEY else '❌ Missing'}**")
st.sidebar.markdown(f"- **Advanced Model:** **{'✅ Connected' if GEMINI_API_KEY else '❌ Missing'}**")
st.sidebar.markdown("---")
if not MODELS_LOADED:
    st.sidebar.error("Model loading failed. NLP features disabled.")

# --- Main App Title ---
st.title("🧠 Ultra Fake News Detector")
st.markdown("\n\nDynamic verdict using Semantic Similarity, NLI, and an Advanced Credibility Score.\n\n", unsafe_allow_html=True)

# --- Input Section ---
_, col_input, _ = st.columns([1, 4, 1])
with col_input:
    claim = st.text_area(
        "Enter claim or news statement:",
        height=150,
        placeholder="Example: Modi is pm of india",
        key="claim_input"
    )

if st.button("Verify Claim"):
    run_verification(
        claim,
        num_results=NUM_RESULTS,
        top_k=TOP_K_FOR_VERDICT,
        true_threshold=TRUE_THRESHOLD,
        strict_mode=STRICT_MODE,
        full_power_mode=FULL_POWER_MODE,
    )

# Footer
st.markdown("---")
st.caption("Powered by: **Google Advanced Model** and **SerpAPI** for web search. Code by Gemini.")