Spaces:

Yadav88
/

FAKE

Sleeping

App Files Files Community

FAKE / src /streamlit_app.py

Yadav88

Update src/streamlit_app.py

6409f49 verified about 2 months ago

raw

history blame contribute delete

24.1 kB

	import html
	import json
	import logging
	import os
	import re
	import time
	import unicodedata
	from urllib.parse import urlparse

	import numpy as np
	import requests
	import streamlit as st
	import torch
	import torch.nn.functional as F
	from sentence_transformers import SentenceTransformer, util
	from serpapi import GoogleSearch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification

	# --- Custom CSS for Styling ---
	def load_custom_css():
	    """Inject the app's custom stylesheet into the Streamlit page.

	    The CSS is emitted through ``st.markdown`` with ``unsafe_allow_html=True``.
	    It styles the title, the step indicator, the verdict card, the summary box
	    and the weighted-credibility-score meter. The meter rules
	    (``.wcs-progress-container`` / ``.wcs-pointer``) were missing even though
	    the results markup uses those classes, so the meter had no visible bar or
	    pointer.
	    """
	    st.markdown("""
	<style>
	/* Modern Font and Deeper Dark Mode */
	@import url('https://fonts.googleapis.com/css2?family=Roboto+Mono:wght@400;700&display=swap');

	html, body, [class*="stApp"] {
	font-family: 'Roboto Mono', monospace;
	}

	/* Main Title Styling */
	h1 {
	text-align: center;
	color: #00ffc8;
	text-shadow: 0 0 15px rgba(0, 255, 200, 0.7);
	font-weight: 700;
	padding-bottom: 10px;
	}

	/* Sidebar Styling for Tabs */
	.st-emotion-cache-1ftc0d1 { /* Class for sidebar contents */
	padding-top: 1rem;
	}

	/* --- Dynamic Step Indicator --- */
	.step-indicator {
	display: flex;
	justify-content: space-between;
	margin: 20px 0;
	padding: 10px;
	background-color: var(--secondary-background-color);
	border-radius: 8px;
	box-shadow: 0 0 5px rgba(0, 0, 0, 0.2);
	}
	.step {
	padding: 5px 10px;
	border-radius: 6px;
	color: var(--text-color);
	opacity: 0.6;
	font-weight: bold;
	transition: all 0.3s;
	}
	.step.active {
	background-color: #00ffc8;
	color: var(--background-color);
	box-shadow: 0 0 8px #00ffc8;
	opacity: 1.0;
	transform: scale(1.05);
	}
	.step.faded {
	opacity: 0.3;
	}

	/* Verdict Card Styling (TRUE/FAKE) */
	.verdict-box {
	padding: 30px;
	margin: 20px 0;
	border-radius: 15px;
	text-align: center;
	box-shadow: 0 8px 25px rgba(0, 0, 0, 0.7);
	transition: all 0.3s ease-in-out;
	}
	.verdict-true { background-color: #1a473f; border: 3px solid #00ff88; }
	.verdict-fake { background-color: #471a1a; border: 3px solid #ff0044; }
	.verdict-neutral { background-color: #2e2e1a; border: 3px solid #ffff00; }
	.verdict-text {
	font-size: 3em !important;
	font-weight: 700;
	margin: 0;
	color: white;
	}

	/* Summary Box */
	.summary-box {
	background-color: var(--secondary-background-color);
	padding: 20px;
	border-radius: 10px;
	border: 1px solid #00ffc840;
	margin-top: 15px;
	}

	/* Weighted Credibility Score Meter */
	.wcs-progress-container {
	position: relative;
	height: 14px;
	border-radius: 7px;
	background: linear-gradient(to right, #ff0044, #ffff00, #00ff88);
	}
	.wcs-pointer {
	position: absolute;
	top: -5px;
	width: 4px;
	height: 24px;
	border-radius: 2px;
	background-color: #ffffff;
	box-shadow: 0 0 6px rgba(0, 0, 0, 0.8);
	transform: translateX(-50%);
	}
	</style>
	""", unsafe_allow_html=True)

	# --- API Key Configuration ---
	# Both keys are read from the process environment; each is None when its variable is unset.
	SERPAPI_KEY = os.environ.get("SERPAPI_KEY")
	GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
	# generateContent endpoint that call_advanced_model_for_credibility() posts to.
	GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-09-2025:generateContent"

	# --- SYSTEM PROMPT TEMPLATES ---
	# get_system_prompt() concatenates these: the base prompt always, the two rule
	# prompts only when their respective flags are set.
	BASE_SYSTEM_PROMPT = """
	You are a highly intelligent fact-checking AI. Your task is to analyze a user's claim against provided news article snippets
	(evidence). Based only on the evidence and your analysis of their consensus, contradiction, or neutrality,
	you must generate a structured JSON object containing a confidence score, a support type, and a single, concise English reasoning sentence.

	- Score range is -1.0 (Definitely Contradicted) to +1.0 (Fully Entailed).
	"""

	# Appended when strict_mode is true.
	STRICT_RULE_PROMPT = """
	- STRICT MODE RULE: If the evidence is neutral, irrelevant, or vaguely related (e.g., mentioning similar words but not the event), the score must be close to 0.0 or slightly negative. Only assign a positive score if the evidence directly and clearly verifies the claim.
	"""

	# Appended when hard_decision is true (the caller sets it when no web evidence was found).
	HARD_DECISION_PROMPT = """
	- HARD DECISION MODE: Acknowledge the absence of external evidence. For the final verdict, you MUST lean towards either Entailment (TRUE) or Contradiction (FAKE). Only use Neutral if the claim is highly subjective or unprovable. For claims that are widely known facts (e.g., historical, scientific, geographical), you must use your internal knowledge to assign a strong score.
	"""

	# ---------------- CACHE / MODEL LOADERS ----------------
	# The model loaders below are decorated with st.cache_resource.
	@st.cache_resource
	def load_embedder():
	    """Create the 'all-MiniLM-L6-v2' sentence-embedding model on the CPU and return it."""
	    embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
	    return embedding_model

	@st.cache_resource
	def load_nli_model():
	    """Load the roberta-large-mnli tokenizer and sequence classifier.

	    The classifier is moved to the CPU before being returned.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    checkpoint = "roberta-large-mnli"
	    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
	    classifier.to("cpu")
	    return tokenizer, classifier

	# Load both models once when the script runs. A failure must not crash the
	# page: the sidebar reports it through MODELS_LOADED instead.
	try:
	    embedder = load_embedder()
	    nli_tok, nli_model = load_nli_model()
	    MODELS_LOADED = True
	except Exception:
	    # Keep the names bound so later references do not raise NameError, and
	    # record the cause, which was previously discarded without a trace.
	    embedder = nli_tok = nli_model = None
	    MODELS_LOADED = False
	    logging.getLogger(__name__).exception("Failed to load NLP models")

	# ---------------- Advanced Model Integration Function ----------------
	def get_system_prompt(strict_mode, hard_decision):
	    """Assemble the system prompt sent to the advanced model.

	    Args:
	        strict_mode: when truthy, STRICT_RULE_PROMPT is appended.
	        hard_decision: when truthy, HARD_DECISION_PROMPT is appended (the
	            caller sets this when no web evidence was found).

	    Returns:
	        BASE_SYSTEM_PROMPT followed by the selected rule prompts, in that order.
	    """
	    sections = [BASE_SYSTEM_PROMPT]
	    if strict_mode:
	        sections.append(STRICT_RULE_PROMPT)
	    if hard_decision:
	        sections.append(HARD_DECISION_PROMPT)
	    return "".join(sections)

	def _mock_credibility_result(claim, no_evidence):
	    """Return the canned result used when no Gemini API key is configured."""
	    claim_lc = claim.lower()
	    is_demo_claim = "modi" in claim_lc and "pm" in claim_lc

	    if no_evidence:
	        # If no evidence and hard decision is requested, assume 0.9 for the known fact example
	        confidence = 0.9 if is_demo_claim else 0.0
	        if confidence != 0.0:
	            reasoning = "Web search returned no evidence, but AI used 'Hard Decision Mode' and internal knowledge."
	        else:
	            reasoning = "Web search returned no evidence. Model cannot confirm or deny without external data."
	        return {"confidence": confidence, "type": "Entailment" if confidence > 0.5 else "Neutral", "reasoning": reasoning}

	    # Normal flow mock
	    if is_demo_claim:
	        return {"confidence": 0.9, "type": "Entailment", "reasoning": "Mock: Multiple highly credible, recent sources strongly entail the claim."}
	    return {"confidence": 0.0, "type": "Neutral", "reasoning": "Advanced Model API key is missing. Skipping analysis."}


	def _build_credibility_prompt(claim, analyzed_articles, no_evidence):
	    """Build the user prompt: the claim plus either the evidence list or a no-evidence notice."""
	    if no_evidence:
	        return (
	            "Analyze the following claim. CRITICAL: NO WEB EVIDENCE WAS FOUND for this claim. "
	            "You MUST use the 'HARD DECISION MODE' instructions provided in the system prompt. Do not use external evidence, rely on your internal knowledge.\n\n"
	            f"CLAIM: {claim}\n\n"
	            "EVIDENCE SNIPPETS (0 Found): None"
	        )

	    evidence_list = [
	        f"--- Source {position} ({domain_from_url(article.get('link',''))}) ---\n"
	        f"Snippet: {article.get('snippet','')}\n"
	        f"NLI Scores (E/N/C): {article.get('entail_p',0.0):.2f}/{article.get('neutral_p',0.0):.2f}/{article.get('contra_p',0.0):.2f}\n"
	        for position, article in enumerate(analyzed_articles, start=1)
	    ]
	    return (
	        "Analyze the following claim against the provided search evidence. "
	        "Your decision must be based on the consensus of the evidence. Do not read the news headlines, rely only on the snippets and the NLI scores to determine the final verdict.\n\n"
	        f"CLAIM: {claim}\n\n"
	        f"EVIDENCE SNIPPETS (Top {len(analyzed_articles)}):\n"
	        + "\n".join(evidence_list)
	    )


	def call_advanced_model_for_credibility(claim, analyzed_articles, no_evidence=False, strict_mode=False):
	    """Ask the Gemini model to score a claim against the analysed articles.

	    Args:
	        claim: the user's claim text.
	        analyzed_articles: list of dicts; the keys read here are 'link',
	            'snippet', 'entail_p', 'neutral_p' and 'contra_p'.
	        no_evidence: True when the web search found nothing. The hard-decision
	            rule is then added to the system prompt and the user prompt tells
	            the model to rely on internal knowledge.
	        strict_mode: adds the strict-evidence rule to the system prompt.

	    Returns:
	        dict with 'confidence' (float clipped to [-1.0, 1.0]), 'type'
	        ('Entailment', 'Contradiction', 'Neutral', or 'Error' after all
	        retries fail) and 'reasoning' (str). When GEMINI_API_KEY is not set a
	        canned mock result is returned and no request is made.
	    """
	    # Hard decision only if no evidence found
	    system_prompt = get_system_prompt(strict_mode, hard_decision=no_evidence)

	    if not GEMINI_API_KEY:
	        # Mock result simulation for visualization
	        return _mock_credibility_result(claim, no_evidence)

	    prompt = _build_credibility_prompt(claim, analyzed_articles, no_evidence)

	    response_schema = {
	        "type": "OBJECT",
	        "properties": {
	            "verdict_confidence": {"type": "NUMBER", "description": "A score from -1.0 (Contradicted) to +1.0 (Entailed)."},
	            "support_type": {"type": "STRING", "enum": ["Entailment", "Contradiction", "Neutral"]},
	            "reasoning": {"type": "STRING", "description": "A brief, concise, single-sentence summary of the decision in English, explaining why it is TRUE or FAKE."}
	        },
	        "required": ["verdict_confidence", "support_type", "reasoning"]
	    }

	    payload = {
	        "contents": [{"parts": [{"text": prompt}]}],
	        "systemInstruction": {"parts": [{"text": system_prompt}]},
	        "generationConfig": {
	            "responseMimeType": "application/json",
	            "responseSchema": response_schema
	        },
	    }

	    # The key travels in the x-goog-api-key header rather than the URL query
	    # string, so it cannot leak through logged URLs or exception messages.
	    request_headers = {'Content-Type': 'application/json', 'x-goog-api-key': GEMINI_API_KEY}

	    max_retries = 3
	    delay = 1
	    for attempt in range(1, max_retries + 1):
	        try:
	            response = requests.post(
	                GEMINI_API_URL,
	                headers=request_headers,
	                data=json.dumps(payload),
	                timeout=15
	            )
	            response.raise_for_status()

	            result_json_str = response.json()['candidates'][0]['content']['parts'][0]['text']
	            model_result = json.loads(result_json_str)

	            confidence = float(np.clip(model_result.get('verdict_confidence', 0.0), -1.0, 1.0))

	            return {
	                "confidence": confidence,
	                "type": model_result.get('support_type', 'Neutral'),
	                "reasoning": model_result.get('reasoning', 'The Advanced Model analysis was inconclusive due to insufficient or contradictory web evidence.')
	            }
	        except Exception as exc:
	            # Best-effort call: any failure (network, HTTP status, malformed
	            # response) is retried with exponential backoff, but now logged.
	            logging.getLogger(__name__).warning(
	                "Advanced Model request failed (attempt %d/%d): %s", attempt, max_retries, exc
	            )
	            if attempt < max_retries:
	                time.sleep(delay)
	                delay *= 2

	    return {"confidence": 0.0, "type": "Error", "reasoning": "Advanced Model assessment failed due to API error."}

	# ---------------- Utilities ----------------
	def domain_from_url(url):
	    """Return the host part of *url* without a leading ``www.``.

	    Only a ``www.`` prefix is stripped; the previous ``str.replace`` removed
	    the substring anywhere in the host (``awww.example.com`` became
	    ``aexample.com``).

	    Args:
	        url: URL string; an empty string yields an empty host.

	    Returns:
	        The network location, or *url* unchanged when it cannot be parsed.
	    """
	    try:
	        host = urlparse(url).netloc
	        if host.startswith("www."):
	            host = host[len("www."):]
	        return host
	    except (ValueError, TypeError, AttributeError):
	        # ValueError: malformed URL; TypeError/AttributeError: non-string input.
	        return url

	def pretty_pct(x):
	    """Format a fraction as a whole-number percentage string, e.g. 0.29 -> "29%".

	    The value is rounded rather than truncated: ``int(0.29 * 100)`` is 28
	    because of binary floating-point error, which under-reported percentages.
	    ``float(x)`` is applied first so any value convertible to float is accepted.
	    """
	    return f"{int(round(float(x) * 100))}%"

	# --- NEW CLEANING FUNCTION (to fix the zombie ant problem) ---
	def clean_claim_for_search(claim):
	    """Reduce a raw claim to a short, search-friendly query.

	    Steps:
	      1. Strip surrounding whitespace and one pair of enclosing double quotes.
	      2. Drop every character that is not a letter, digit, whitespace or one
	         of ``. , ? !``. Letters, digits and combining marks of any script are
	         kept; the previous ASCII-only filter erased non-English claims.
	      3. Collapse runs of whitespace.
	      4. Keep only the first sentence. A period ends a sentence only when it
	         is followed by whitespace or the end of the text, so decimals such
	         as ``7.5`` are no longer cut in half.
	      5. Truncate to 150 characters.

	    Args:
	        claim: the claim text as typed by the user.

	    Returns:
	        The cleaned query string (possibly empty).
	    """
	    cleaned = claim.strip()
	    if cleaned.startswith('"') and cleaned.endswith('"'):
	        cleaned = cleaned[1:-1]

	    # Remove excessive punctuation that might confuse the search engine but keep basic sentence structure
	    cleaned = "".join(
	        ch for ch in cleaned
	        if ch.isalnum()
	        or ch.isspace()
	        or ch in ".,?!"
	        or unicodedata.category(ch).startswith("M")  # combining marks (e.g. Devanagari vowel signs)
	    )
	    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

	    # Take the first complete sentence/idea for a focused search
	    parts = re.split(r'\.(?=\s|$)', cleaned, maxsplit=1)
	    if len(parts) > 1 and parts[0]:
	        cleaned = parts[0] + '.'

	    return cleaned[:150]  # Limit length

	# NOTE: the NLI helper, best_sentence, domain_boost and analyze_top_articles are not
	# defined in this file as shown. analyze_top_articles is called by the main flow below
	# and must be provided (the fixed version from the previous revision is assumed).

	# ---------------- UI Layout and Main Execution ----------------

	# --- SIDEBAR (NEW CONFIGURATION TABS) ---
	# Tab labels are defined once so the radio options and the comparisons below cannot drift apart.
	TAB_CORE = "⚙️ Core Config"
	TAB_STRENGTH = "⚡ Strength Config"
	TAB_HISTORY = "📜 History / Context"

	st.sidebar.markdown("<h2 style='color:#00ffc8;'>⚡ Detector Control Panel</h2>", unsafe_allow_html=True)
	config_tab = st.sidebar.radio("Settings Group", [TAB_CORE, TAB_STRENGTH, TAB_HISTORY])

	# Defaults for every setting. Only the widgets of the selected tab are created
	# on a given run, so without these the names of the other tabs' settings would
	# be unbound.
	# NOTE(review): a value chosen on one tab is replaced by its default here
	# whenever a different tab is the selected one.
	NUM_RESULTS = 10
	TOP_K_FOR_VERDICT = 3
	TRUE_THRESHOLD = 0.35
	STRICT_MODE = True
	FULL_POWER_MODE = False

	# --- 1. CORE CONFIG ---
	if config_tab == TAB_CORE:
	    st.sidebar.markdown("### 🔍 Search Parameters")
	    NUM_RESULTS = st.sidebar.slider("Search Depth (Web Results)", 5, 20, 10, 5)
	    TOP_K_FOR_VERDICT = st.sidebar.slider("Verdict Sources (Articles Analyzed)", 1, 5, 3)
	    TRUE_THRESHOLD = st.sidebar.slider("TRUE Threshold Score (> X)", 0.1, 0.7, 0.35, 0.05)
	    st.sidebar.markdown("---")

	# --- 2. STRENGTH CONFIG ---
	elif config_tab == TAB_STRENGTH:
	    st.sidebar.markdown("### 🤖 AI Assessment Rigor")

	    STRICT_MODE = st.sidebar.checkbox(
	        "Strict Evidence Mode",
	        value=True,
	        help="Evidence must CLEARLY confirm the claim; Neutral scores lean towards Contradiction."
	    )

	    FULL_POWER_MODE = st.sidebar.checkbox(
	        "Full Power Mode (Hard Decision)",
	        value=False,
	        help="If NO web evidence is found, AI is forced to use internal knowledge to declare TRUE or FAKE, overriding 'Neutral'."
	    )

	    # Warn the user about what enabling Full Power Mode implies
	    if FULL_POWER_MODE:
	        st.sidebar.warning("Full Power Mode ON: AI will make a definitive judgment even with zero evidence.")

	# --- 3. HISTORY / CONTEXT ---
	elif config_tab == TAB_HISTORY:
	    st.sidebar.markdown("### 📚 Analysis History (Future Feature)")
	    st.sidebar.info("This section will store and manage past fact-checks.")

	# --- API Status Indicators (Always visible) ---
	st.sidebar.markdown("---")
	st.sidebar.markdown("### 🔑 API Status")
	st.sidebar.markdown(f"- SerpAPI: {'✅ Connected' if SERPAPI_KEY else '❌ Missing'}")
	st.sidebar.markdown(f"- Advanced Model: {'✅ Connected' if GEMINI_API_KEY else '❌ Missing'}")
	st.sidebar.markdown("---")
	if not MODELS_LOADED:
	    st.sidebar.error("Model loading failed. NLP features disabled.")


	# --- Main App Title ---
	# Inject the stylesheet first: load_custom_css() was defined but never called,
	# so the verdict card, step indicator and summary box classes had no styling.
	load_custom_css()

	st.title("🧠 Ultra Fake News Detector")
	st.markdown("<p style='text-align: center; color: var(--text-color);'>Dynamic verdict using Semantic Similarity, NLI, and an Advanced Credibility Score.</p>", unsafe_allow_html=True)

	# --- Input Section ---
	# Three columns with the outer two left empty, so the text area sits in the middle one.
	_, col_input, _ = st.columns([1, 4, 1])

	with col_input:
	    claim = st.text_area(
	        "Enter claim or news statement:",
	        height=150,
	        placeholder="Example: Modi is pm of india",
	        key="claim_input"
	    )

	if st.button("Verify Claim"):

	    # Initialize configuration variables if the tabs weren't touched
	    # (This is necessary because Streamlit re-runs the whole script and each
	    # setting is only assigned while its sidebar tab is the selected one.
	    # At module level globals() is the namespace the old locals() check inspected.)
	    NUM_RESULTS = globals().get('NUM_RESULTS', 10)
	    TOP_K_FOR_VERDICT = globals().get('TOP_K_FOR_VERDICT', 3)
	    TRUE_THRESHOLD = globals().get('TRUE_THRESHOLD', 0.35)
	    STRICT_MODE = globals().get('STRICT_MODE', True)
	    FULL_POWER_MODE = globals().get('FULL_POWER_MODE', False)

	    if not claim.strip():
	        # Previously the warning was shown but the pipeline still ran on the empty claim.
	        st.warning("Please enter a claim to verify.")
	    else:
	        processed_claim = clean_claim_for_search(claim)
	        if processed_claim != claim.strip():
	            st.info(f"✨ Pre-processing: Claim cleaned for better search results. (Query: '{processed_claim}')")

	        # --- Verification Process ---
	        status_placeholder = st.empty()

	        def update_step(active_step, fade_steps=()):
	            """Redraw the three-step progress indicator in the status placeholder.

	            active_step is the index of the highlighted step (-1 highlights
	            none); steps whose index is in fade_steps are dimmed.
	            """
	            steps = ["🌐 Web Search", "🧠 NLI Analysis", "🤖 AI Assessment"]
	            step_html = "<div class='step-indicator'>"
	            for i, step in enumerate(steps):
	                step_class = 'active' if i == active_step else ('faded' if i in fade_steps else '')
	                step_html += f"<span class='step {step_class}'>{step}</span>"
	            step_html += "</div>"
	            status_placeholder.markdown(step_html, unsafe_allow_html=True)

	        # 1) SerpAPI fetch
	        update_step(0)
	        time.sleep(0.5)

	        results = []
	        try:
	            # NOTE(review): "tbs": "qdr:d1" presumably limits results to the past day - confirm this is intended.
	            params = {"engine":"google", "q": processed_claim, "tbm":"nws", "tbs":"qdr:d1", "num": NUM_RESULTS, "api_key": SERPAPI_KEY}
	            search = GoogleSearch(params)
	            data = search.get_dict()
	            results = data.get("news_results") or data.get("organic_results") or []
	        except Exception:
	            # Best effort: a failed search is treated as "no results", but the cause is now logged.
	            logging.getLogger(__name__).exception("SerpAPI search failed")
	            results = []

	        normalized = []

	        if not results:
	            # --- SCENARIO 1: NO WEB RESULTS (RUN AI HARD DECISION) ---
	            # NOTE(review): the hard-decision prompt is used whenever no results are found;
	            # FULL_POWER_MODE is not passed to the model and only changes the summary label below.

	            update_step(-1, fade_steps=[0, 1])
	            st.warning("⚠️ Web Search returned 0 results. Proceeding to AI Hard Assessment based on lack of external evidence.")

	            # Placeholder/Zero metrics for NLI
	            metrics = {
	                "avg_ent": 0.0, "avg_con": 0.0, "avg_neutral": 1.0,
	                "avg_sim": 0.0, "avg_cred": 0.0, "net_support": 0.0,
	                "support_score": 0.0
	            }
	            analyzed = []  # No articles to analyze

	            # 3) Advanced Model Analysis: Running with NO EVIDENCE flag
	            update_step(2, fade_steps=[0, 1])
	            time.sleep(0.5)

	            # CRITICAL CALL: Passing no_evidence=True
	            model_score = call_advanced_model_for_credibility(claim, analyzed, no_evidence=True, strict_mode=STRICT_MODE)

	            # WCS is dominated by AI score (since NLI is 0)
	            weighted_credibility_score = model_score['confidence']

	        else:
	            # --- SCENARIO 2: RESULTS FOUND (Normal Flow) ---

	            # Result dicts vary in shape, so each field tries several alternative keys.
	            for r in results:
	                title = r.get("title") or r.get("title_raw") or r.get("title_original") or ""
	                snippet = r.get("snippet") or r.get("snippet_highlighted") or r.get("excerpt") or ""
	                link = r.get("link") or r.get("source", {}).get("url") or r.get("source_link") or ""
	                normalized.append({"title": title, "snippet": snippet, "link": link})

	            # 2) NLI/Semantic Analysis
	            # NOTE(review): analyze_top_articles is not defined in the visible part of this
	            # file; it must exist for this branch to run.
	            update_step(1)
	            time.sleep(0.5)
	            metrics, analyzed = analyze_top_articles(normalized, claim, top_k=TOP_K_FOR_VERDICT)

	            # 3) Advanced Model Analysis
	            update_step(2)
	            time.sleep(0.5)
	            model_score = call_advanced_model_for_credibility(claim, analyzed, no_evidence=False, strict_mode=STRICT_MODE)

	            # 4) Combine Scores for Final Weighted Credibility Score (WCS)
	            WEIGHT_NLI = 0.20
	            WEIGHT_ADVANCED_MODEL = 0.80

	            nli_normalized_score = np.clip(metrics['support_score'], -1.0, 1.0)
	            weighted_credibility_score = (WEIGHT_NLI * nli_normalized_score) + (WEIGHT_ADVANCED_MODEL * model_score['confidence'])

	        status_placeholder.empty()  # Clear the final step indicator

	        # --- FINAL DYNAMIC VERDICT DISPLAY ---

	        if weighted_credibility_score >= TRUE_THRESHOLD:
	            verdict_class = "verdict-true"
	            verdict_text = "✅ TRUE"
	            rationale_color = '#00ff88'
	        elif weighted_credibility_score <= -TRUE_THRESHOLD:  # Use the same threshold for FAKE
	            verdict_class = "verdict-fake"
	            verdict_text = "🚨 FAKE"
	            rationale_color = '#ff0044'
	        else:
	            verdict_class = "verdict-neutral"
	            verdict_text = "❓ INCONCLUSIVE"
	            rationale_color = '#ffff00'

	        # 1. Big Verdict Box
	        st.markdown(
	            f"<div class='verdict-box {verdict_class}'><p class='verdict-text'>{verdict_text}</p></div>",
	            unsafe_allow_html=True
	        )

	        # 2. Key Summary Section
	        st.markdown("<div class='summary-box'>", unsafe_allow_html=True)
	        st.markdown(f"### 💡 Key Analysis Summary (Mode: {'FULL POWER' if FULL_POWER_MODE and not results else 'STANDARD'})")

	        col_s1, col_s2, col_s3 = st.columns(3)
	        with col_s1:
	            st.markdown(f"Final Score: `{weighted_credibility_score:.3f}`")
	        with col_s2:
	            st.markdown(f"Source Consensus: `{model_score['type']}`")
	        with col_s3:
	            st.markdown(f"Web Support: `{'N/A' if not results else pretty_pct(metrics['avg_ent'])}`")

	        # The reasoning text is model output and is rendered with unsafe_allow_html,
	        # so it is escaped to keep stray markup out of the page.
	        safe_reasoning = html.escape(str(model_score['reasoning']))
	        st.markdown(f"<p style='padding-top: 10px; border-top: 1px dashed #ffffff20;'>Model Rationale: <span style='color:{rationale_color};'>{safe_reasoning}</span></p>", unsafe_allow_html=True)
	        st.markdown("</div>", unsafe_allow_html=True)

	        st.markdown("---")

	        # 3. Weighted Credibility Score Meter
	        st.markdown("<h3 style='text-align: center; color: #00ffc8;'>Final Weighted Credibility Score</h3>", unsafe_allow_html=True)

	        meter_col1, meter_col2, meter_col3 = st.columns([1, 4, 1])
	        with meter_col2:
	            st.markdown(f"<p style='text-align:center; font-size: 1.5em; font-weight: bold;'>{weighted_credibility_score:.3f}</p>", unsafe_allow_html=True)

	            # Map the score from [-1, 1] onto a 0-100% horizontal position.
	            pointer_left = (weighted_credibility_score + 1.0) / 2.0 * 100
	            st.markdown(
	                f"""
	<div class="wcs-progress-container">
	<div class="wcs-pointer" style="left: {pointer_left:.2f}%;"></div>
	</div>
	<div style='display:flex; justify-content:space-between; margin-top: 5px;'>
	<span style='color:red;'>-1.0 (FAKE)</span>
	<span style='color:yellow;'>0.0 (NEUTRAL)</span>
	<span style='color:green;'>+1.0 (TRUE)</span>
	</div>
	""", unsafe_allow_html=True
	            )

	        st.markdown("---")

	        # 4. Detailed Metrics in Expander with 3-Column Card Layout
	        with st.expander("📊 Detailed Analysis Metrics"):

	            if results:
	                st.markdown("### NLI (Natural Language Inference) Consensus (20% Weight)")

	                col_e, col_n, col_c = st.columns(3)
	                with col_e:
	                    st.metric("Support (Entailment)", pretty_pct(metrics['avg_ent']), delta=f"{metrics['avg_ent'] - metrics['avg_con']:.2f} Net", delta_color="normal")
	                with col_n:
	                    st.metric("Neutral (Irrelevant)", pretty_pct(metrics['avg_neutral']))
	                with col_c:
	                    st.metric("Contradiction", pretty_pct(metrics['avg_con']), delta_color="inverse")

	                st.markdown("---")
	            else:
	                st.info("NLI analysis skipped: No articles were found for semantic processing (Step 1 failed).")
	                st.markdown("---")

	            st.markdown("### Advanced Model Assessment (80% Weight)")
	            st.write(f"Model Confidence Score: {model_score['confidence']:.3f} ({model_score['type']})")
	            st.write(f"Model Reasoning: {model_score['reasoning']}")

	        # 5. Analyzed Sources Expander
	        with st.expander(f"🔎 Analyzed Web Sources (Top {TOP_K_FOR_VERDICT} Articles)"):
	            if results:
	                for idx, r in enumerate(analyzed):
	                    source_domain = domain_from_url(r.get('link', ''))
	                    # The '(no title)' fallback used to be passed through domain_from_url,
	                    # which turned it into an empty string.
	                    st.markdown(f"{idx+1}. {r.get('title') or source_domain or '(no title)'}")
	                    # A plain "|" replaces the invalid "\|" escape sequence.
	                    st.caption(f"🔗 {source_domain} | Credibility Boost: {r.get('cred',0.0):.2f}")

	                    net_support_val = (r.get('entail_p',0.0) - r.get('contra_p',0.0))

	                    st.markdown(f"Net Support Score: `{net_support_val:.2f}`")

	                    # Map [-1, 1] onto [0, 1]; clamp and convert to a plain float before
	                    # handing it to st.progress.
	                    progress_val_source = (net_support_val + 1.0) / 2.0
	                    st.progress(float(min(max(progress_val_source, 0.0), 1.0)))

	                    st.markdown(f"(E: {pretty_pct(r.get('entail_p',0.0))} | N: {pretty_pct(r.get('neutral_p',0.0))} | C: {pretty_pct(r.get('contra_p',0.0))})")
	                    st.markdown(f"Snippet (Most Relevant Sentence): {r.get('best_sent') or r.get('snippet')}")
	                    st.markdown("---")
	            else:
	                st.markdown("No web search results were found to analyze.")

	# Footer
	# A horizontal rule followed by a small attribution caption.
	st.markdown("---")
	st.caption("Powered by: Google Advanced Model and SerpAPI for web search. Code by Gemini.")