# Flask app for Subtitle KIS - main routes + search flow
import os
import re
import json as flask_json

from flask import Flask, render_template, request, jsonify
from markupsafe import escape, Markup
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# NOTE: heavy imports moved to lazy inside perform_search()
# from semantic_search import search_query
# from nlp_summary import summarize_text
from autocomplete import get_suggestions
from config import ABBREVIATION_MAP, VIDEO_METADATA, SEARCH_CONFIG

# App setup
template_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'templates')
app = Flask(__name__, template_folder=template_dir)


# Security headers: Content-Security-Policy
@app.after_request
def apply_csp(response):
    response.headers["Content-Security-Policy"] = (
        "default-src 'self'; "
        "img-src 'self' https://img.youtube.com data:; "
        "script-src 'self' 'unsafe-inline'; "
        "style-src 'self' 'unsafe-inline';"
    )
    return response
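
# NOTE: 'unsafe-inline' keeps the app's inline scripts/styles working; if those
# ever move to static files, the policy could be tightened with nonces or hashes.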


# Route: Home page
@app.route("/")
def index():
    return render_template("index.html")


# Health check (fast) - for HF Spaces readiness
@app.route("/health")  # path assumed
def health():
    return {"ok": True}, 200


# Template filter: convert HH:MM:SS to seconds
@app.template_filter("jump_time")
def jump_time(timestamp):
    try:
        h, m, s = timestamp.split(':')
        total = int(h) * 3600 + int(m) * 60 + int(float(s))
        return max(total - 2, 0)
    except (ValueError, AttributeError):
        return 0
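
# e.g. jump_time("00:01:30") -> 88 (90 seconds minus the 2-second offset,
# so playback starts just before the matching line)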


# NLP helpers: lemmatizer + synonym expansion
lemmatizer = WordNetLemmatizer()
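# WordNet data must be available at runtime; a one-time setup step such as
#   import nltk; nltk.download("wordnet")
# (and "omw-1.4" on newer NLTK versions) is assumed to have been run already.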


def get_synonyms(word):
    """Return a set of synonyms for a single word."""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return synonyms
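
# e.g. get_synonyms("car") typically yields {"car", "auto", "automobile",
# "machine", "motorcar", ...}; the exact set depends on the installed corpus.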


# Highlighting: wrap query matches in <mark> tags
def highlight_keywords(text, keyword, semantic_mode=False):
    """
    Highlight exact matches always.
    In semantic mode, also highlight synonyms and lemmas.
    """
    safe_text = escape(text)
    # Short keywords get explicit word-boundary guards to avoid partial hits
    if len(keyword) <= 3:
        pattern = re.compile(rf"(?<!\w){re.escape(keyword)}(?!\w)", re.IGNORECASE)
    else:
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    if pattern.search(safe_text):
        return pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", safe_text)
    # Semantic mode: no exact hit, so try lemmas and synonyms per query word
    if semantic_mode:
        words = keyword.split()
        for w in words:
            lemma = lemmatizer.lemmatize(w.lower())
            candidates = {lemma} | get_synonyms(w)
            for cand in candidates:
                if len(cand) <= 3:
                    syn_pattern = re.compile(rf"(?<!\w){re.escape(cand)}(?!\w)", re.IGNORECASE)
                else:
                    syn_pattern = re.compile(rf"\b{re.escape(cand)}\b", re.IGNORECASE)
                if syn_pattern.search(safe_text):
                    safe_text = syn_pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", safe_text)
        return safe_text
    return safe_text
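
# e.g. highlight_keywords("The cat sat on the mat", "cat")
#   -> 'The <mark>cat</mark> sat on the mat'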


# Core search orchestration
def perform_search(query, start=0, shown=0, previous_results=None, semantic_mode=False):
    """Shared search logic for both HTML and JSON endpoints."""
    if previous_results is None:
        previous_results = []
    # Lazy imports so heavy modules load on first search, not at boot
    from semantic_search import search_query
    from nlp_summary import summarize_text
    raw_results, _ = search_query(query, offset=0, top_k=1000, semantic_mode=semantic_mode)
    # Keyword mode: keep only literal (case-insensitive) matches
    if not semantic_mode:
        raw_results = [r for r in raw_results if re.search(re.escape(query), r["text"], re.IGNORECASE)]
    page_size = SEARCH_CONFIG.get("results_per_page", 5)
    paged_results = raw_results[start:start + page_size]
    new_results = []
    for idx, r in enumerate(paged_results):
        vid_id = r.get("video_id")
        friendly_key = next((k for k, v in VIDEO_METADATA.items() if v["id"] == vid_id), None)
        r["video_title"] = VIDEO_METADATA.get(friendly_key, {}).get("title", "Unknown Title")
        context_chunks = []
        if idx > 0:
            context_chunks.append(paged_results[idx - 1]["summary_input"])
        context_chunks.append(r["summary_input"])
        if idx + 1 < len(paged_results):
            context_chunks.append(paged_results[idx + 1]["summary_input"])
        summary = summarize_text(" ".join(context_chunks), query=query)
        highlighted_before = highlight_keywords(r["context_before"], query, semantic_mode)
        highlighted_match = highlight_keywords(r["text"], query, semantic_mode)
        highlighted_after = highlight_keywords(r["context_after"], query, semantic_mode)
        r["highlighted_block"] = Markup(f"{highlighted_before}\n{highlighted_match}\n{highlighted_after}")
        r["summary"] = summary
        new_results.append(r)
    combined_results = previous_results + new_results
    shown += len(new_results)
    return combined_results, len(raw_results), shown, start + len(new_results)
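
# Example paging flow (hypothetical numbers): with 12 matches and page_size 5,
# perform_search(q, start=0) returns 5 results, total_matches=12, shown=5,
# next_start=5; the caller passes next_start back as `start` for the next page.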


# HTML endpoint (route path assumed from the form-based flow)
@app.route("/search", methods=["POST"])
def search():
    query = request.form.get("query", "").strip()
    if not query:
        return render_template("index.html", error="Please enter a search query.")
    semantic_mode = request.form.get("semantic") == "true"
    start = int(request.form.get("start", 0))
    try:
        previous_results = flask_json.loads(request.form.get("previous_results", "[]"))
    except ValueError:
        previous_results = []
    # Re-wrap highlighted HTML that survived the JSON round-trip
    for r in previous_results:
        if isinstance(r, dict) and "highlighted_block" in r:
            r["highlighted_block"] = Markup(r["highlighted_block"])
    shown = int(request.form.get("shown", len(previous_results)))
    combined_results, total_matches, shown, next_start = perform_search(
        query, start, shown, previous_results, semantic_mode
    )
    # Abbreviation suggestion: offer the expanded or contracted form of the query
    suggestion_term = ""
    lower_query = query.lower()
    if lower_query in ABBREVIATION_MAP:
        suggestion_term = ABBREVIATION_MAP[lower_query]
    elif lower_query in ABBREVIATION_MAP.values():
        for abbr, full in ABBREVIATION_MAP.items():
            if full == lower_query:
                suggestion_term = abbr
                break
    return render_template(
        "results.html",
        query=query,
        results=combined_results,
        shown=shown,
        start=next_start,
        total_matches=total_matches,
        previous_results=combined_results,
        suggestion_term=suggestion_term,
        semantic=semantic_mode
    )


# JSON API endpoint
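# Example request body (hypothetical query; fields mirror the HTML form):
#   {"query": "machine learning", "semantic": true,
#    "start": 0, "shown": 0, "previous_results": []}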
@app.route("/api/search", methods=["POST"])  # path assumed
def api_search():
    data = request.get_json(force=True)
    query = data.get("query", "").strip()
    semantic_mode = bool(data.get("semantic", False))
    start = int(data.get("start", 0))
    shown = int(data.get("shown", 0))
    previous_results = data.get("previous_results", [])
    combined_results, total_matches, shown, next_start = perform_search(
        query, start, shown, previous_results, semantic_mode
    )
    # Render cards for the most recent page of results only
    rendered_cards = [
        render_template("_result_card.html", result=r, query=query, semantic=semantic_mode)
        for r in combined_results[-SEARCH_CONFIG.get("results_per_page", 5):]
    ]
    return jsonify({
        "html": rendered_cards,
        "shown": shown,
        "total_matches": total_matches,
        "next_start": next_start,
        "has_more": next_start < total_matches
    })


# Autocomplete endpoint
@app.route("/autocomplete")  # path assumed
def autocomplete():
    term = request.args.get("term", "")
    return jsonify(get_suggestions(term))


if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))  # HF Spaces default
    app.run(host="0.0.0.0", port=port, debug=False)