# Provenance (scraped hosting-UI header): NIKKI77 — "Subtitle KIS v1.1 – initial" — commit 5181b3c
# Flask app for Subtitle KIS β€” main routes + search flow
import os
import re
import json as flask_json
from flask import Flask, render_template, request, jsonify
from markupsafe import escape, Markup
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# NOTE: heavy imports moved to lazy inside perform_search()
# from semantic_search import search_query
# from nlp_summary import summarize_text
from autocomplete import get_suggestions
from config import ABBREVIATION_MAP, VIDEO_METADATA, SEARCH_CONFIG
# App setup: templates live one directory above this file (../templates),
# so the template folder is resolved relative to this module's location.
template_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'templates')
app = Flask(__name__, template_folder=template_dir)
# Security headers: Content Security Policy
@app.after_request
def apply_csp(response):
    """Attach a restrictive Content-Security-Policy header to every response.

    Scripts/styles are limited to same-origin (inline allowed for the app's
    own templates); images may additionally come from YouTube thumbnails.
    """
    directives = [
        "default-src 'self'",
        "img-src 'self' https://img.youtube.com data:",
        "script-src 'self' 'unsafe-inline'",
        "style-src 'self' 'unsafe-inline'",
    ]
    # Trailing ';' kept to reproduce the original header string exactly.
    response.headers["Content-Security-Policy"] = "; ".join(directives) + ";"
    return response
# Route: Home page
@app.route("/")
def index():
    """Serve the landing page with the search form."""
    return render_template("index.html")
# Health check (fast) — for HF Spaces readiness
@app.get("/health")
def health():
    """Lightweight liveness probe: always 200 with a trivial JSON body."""
    return {"ok": True}, 200
# Template filter: convert a timestamp string to a jump-to offset in seconds
@app.template_filter("jump_time")
def jump_time(timestamp):
    """Convert 'HH:MM:SS' (also 'MM:SS' or 'SS') to whole seconds, minus a
    2-second lead-in so playback starts just before the matched line.

    Returns 0 for any unparseable input instead of raising — this runs
    inside templates, where a bad timestamp must not break rendering.
    (Fix: the original bare ``except:`` swallowed everything, including
    KeyboardInterrupt; now only parse errors are caught.)
    """
    try:
        parts = [float(p) for p in timestamp.split(":")]
    except (AttributeError, ValueError):
        # Not a string, or a segment wasn't numeric.
        return 0
    if not 1 <= len(parts) <= 3:
        return 0
    # Fold segments left-to-right: h,m,s -> h*3600 + m*60 + s.
    total = 0.0
    for part in parts:
        total = total * 60 + part
    return max(int(total) - 2, 0)
# NLP helpers: lemmatizer + synonym expansion
lemmatizer = WordNetLemmatizer()
def get_synonyms(word):
    """Return every WordNet synonym of *word* as a set of readable strings.

    Lemma names use underscores for spaces (e.g. "hot_dog"); those are
    converted back to spaces before being returned.
    """
    return {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
# Highlighting:
def highlight_keywords(text, keyword, semantic_mode=False):
    """
    Highlight exact matches always.
    In semantic mode, also highlight synonyms and lemmas.
    """
    # Escape first so the inserted <mark> tags are the only raw HTML in the
    # returned string. All matching below runs on the escaped text.
    # NOTE(review): because matching happens AFTER escaping, a keyword like
    # "amp" or "quot" can match inside an HTML entity (e.g. "&amp;") and
    # corrupt the markup — confirm whether queries can contain such terms.
    safe_text = escape(text)
    # Short keywords (<= 3 chars) get word-boundary guards so e.g. "cat"
    # doesn't light up inside "category"; longer keywords match anywhere.
    if len(keyword) <= 3:
        pattern = re.compile(rf"(?<!\w){re.escape(keyword)}(?!\w)", re.IGNORECASE)
    else:
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    # An exact (case-insensitive) hit wins outright — no synonym pass needed.
    if pattern.search(safe_text):
        return pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", safe_text)
    # Semantic mode: fall back to highlighting the lemma and WordNet synonyms
    # of each word in the multi-word query.
    if semantic_mode:
        words = keyword.split()
        for w in words:
            lemma = lemmatizer.lemmatize(w.lower())
            candidates = {lemma} | get_synonyms(w)
            for cand in candidates:
                # Same short-vs-long boundary rule as above, but long
                # candidates use \b...\b (tighter than the exact-match case).
                if len(cand) <= 3:
                    syn_pattern = re.compile(rf"(?<!\w){re.escape(cand)}(?!\w)", re.IGNORECASE)
                else:
                    syn_pattern = re.compile(rf"\b{re.escape(cand)}\b", re.IGNORECASE)
                # NOTE(review): substitutions are applied cumulatively, so a
                # later candidate could in principle match text inside an
                # already-inserted <mark> tag — verify candidate lists never
                # include words like "mark".
                if syn_pattern.search(safe_text):
                    safe_text = syn_pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", safe_text)
        return safe_text
    return safe_text
# Core search orchestration
def perform_search(query, start=0, shown=0, previous_results=None, semantic_mode=False):
    """Shared search logic for both the HTML and JSON endpoints.

    Args:
        query: raw search string.
        start: offset into the full result list for this page.
        shown: number of results already displayed to the user.
        previous_results: results accumulated from earlier pages (or None).
        semantic_mode: when True, rank semantically; otherwise require a
            literal case-insensitive occurrence of the query text.

    Returns:
        Tuple of (combined_results, total_matches, shown, next_start).
    """
    if previous_results is None:
        previous_results = []
    # Lazy imports so the heavy model-backed modules load on the first
    # search request, not at process boot.
    from semantic_search import search_query
    from nlp_summary import summarize_text
    raw_results, _ = search_query(query, offset=0, top_k=1000, semantic_mode=semantic_mode)
    # Keyword mode: keep only entries containing the literal query text.
    # Pattern compiled once instead of re-compiled per result.
    if not semantic_mode:
        keyword_pattern = re.compile(re.escape(query), re.IGNORECASE)
        raw_results = [r for r in raw_results if keyword_pattern.search(r["text"])]
    page_size = SEARCH_CONFIG.get("results_per_page", 5)
    paged_results = raw_results[start:start + page_size]
    # Fix: the original did a linear scan of VIDEO_METADATA per result to
    # recover the title; build the id -> title reverse map once instead.
    title_by_id = {v["id"]: v.get("title", "Unknown Title") for v in VIDEO_METADATA.values()}
    new_results = []
    for idx, r in enumerate(paged_results):
        r["video_title"] = title_by_id.get(r.get("video_id"), "Unknown Title")
        # Summarize with one neighbouring chunk of context on each side
        # when available within the current page.
        context_chunks = []
        if idx > 0:
            context_chunks.append(paged_results[idx - 1]["summary_input"])
        context_chunks.append(r["summary_input"])
        if idx + 1 < len(paged_results):
            context_chunks.append(paged_results[idx + 1]["summary_input"])
        summary = summarize_text(" ".join(context_chunks), query=query)
        highlighted_before = highlight_keywords(r["context_before"], query, semantic_mode)
        highlighted_match = highlight_keywords(r["text"], query, semantic_mode)
        highlighted_after = highlight_keywords(r["context_after"], query, semantic_mode)
        # Markup() keeps Jinja from re-escaping the inserted <mark> tags.
        r["highlighted_block"] = Markup(f"{highlighted_before}\n{highlighted_match}\n{highlighted_after}")
        r["summary"] = summary
        new_results.append(r)
    combined_results = previous_results + new_results
    shown += len(new_results)
    return combined_results, len(raw_results), shown, start + len(new_results)
# HTML endpoint
@app.route("/search", methods=["POST"])
def search():
    """Handle the search form POST: run the query and render results.html.

    Fix: the bare ``except:`` around the previous_results JSON parse now
    catches only parse errors (ValueError covers JSONDecodeError).
    """
    query = request.form.get("query", "").strip()
    if not query:
        return render_template("index.html", error="Please enter a search query.")
    semantic_mode = request.form.get("semantic") == "true"
    start = int(request.form.get("start", 0))
    # previous_results round-trips through the form as JSON; treat any
    # malformed payload as "no prior results" rather than failing the request.
    try:
        previous_results = flask_json.loads(request.form.get("previous_results", "[]"))
    except (ValueError, TypeError):
        previous_results = []
    # Re-wrap stored HTML so Jinja doesn't re-escape the <mark> tags.
    for r in previous_results:
        if isinstance(r, dict) and "highlighted_block" in r:
            r["highlighted_block"] = Markup(r["highlighted_block"])
    shown = int(request.form.get("shown", len(previous_results)))
    combined_results, total_matches, shown, next_start = perform_search(
        query, start, shown, previous_results, semantic_mode
    )
    # Abbreviation hint: suggest the expansion for an abbreviation, or the
    # abbreviation for a known expansion. Empty string means "no suggestion".
    lower_query = query.lower()
    if lower_query in ABBREVIATION_MAP:
        suggestion_term = ABBREVIATION_MAP[lower_query]
    else:
        suggestion_term = next(
            (abbr for abbr, full in ABBREVIATION_MAP.items() if full == lower_query),
            "",
        )
    return render_template(
        "results.html",
        query=query,
        results=combined_results,
        shown=shown,
        start=next_start,
        total_matches=total_matches,
        previous_results=combined_results,
        suggestion_term=suggestion_term,
        semantic=semantic_mode
    )
# JSON API endpoint
@app.route("/api/search", methods=["POST"])
def api_search():
    """JSON variant of /search: returns rendered result cards + paging state."""
    payload = request.get_json(force=True)
    query = payload.get("query", "").strip()
    semantic_mode = bool(payload.get("semantic", False))
    start = int(payload.get("start", 0))
    shown = int(payload.get("shown", 0))
    previous_results = payload.get("previous_results", [])
    combined_results, total_matches, shown, next_start = perform_search(
        query, start, shown, previous_results, semantic_mode
    )
    # Only the newest page of cards is rendered server-side; the client
    # keeps earlier cards in the DOM.
    page_size = SEARCH_CONFIG.get("results_per_page", 5)
    newest_page = combined_results[-page_size:]
    rendered_cards = [
        render_template("_result_card.html", result=item, query=query, semantic=semantic_mode)
        for item in newest_page
    ]
    return jsonify({
        "html": rendered_cards,
        "shown": shown,
        "total_matches": total_matches,
        "next_start": next_start,
        "has_more": next_start < total_matches
    })
# Autocomplete endpoint
@app.route("/autocomplete", methods=["GET"])
def autocomplete():
    """Return autocomplete suggestions for the typed prefix as JSON.

    Fix: the original returned a ``json.dumps`` string, which Flask served
    with a text/html mimetype; jsonify sets application/json so clients
    (e.g. jQuery UI autocomplete) parse it correctly.
    """
    term = request.args.get("term", "")
    return jsonify(get_suggestions(term))
# Script entry point: bind to all interfaces so the container-mapped port
# is reachable from outside; debug disabled for production hosting.
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))  # HF Spaces default
    app.run(host="0.0.0.0", port=port, debug=False)