Spaces:
Sleeping
Sleeping
File size: 7,794 Bytes
903b444 5181b3c 903b444 5181b3c 903b444 5181b3c 903b444 5181b3c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
# Flask app for Subtitle KIS — main routes + search flow
import os
import re
import json as flask_json
from flask import Flask, render_template, request, jsonify
from markupsafe import escape, Markup
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# NOTE: heavy imports moved to lazy inside perform_search()
# from semantic_search import search_query
# from nlp_summary import summarize_text
from autocomplete import get_suggestions
from config import ABBREVIATION_MAP, VIDEO_METADATA, SEARCH_CONFIG
# App setup: templates live in ../templates relative to this file.
base_dir = os.path.dirname(os.path.abspath(__file__))
template_dir = os.path.join(base_dir, '..', 'templates')
app = Flask(__name__, template_folder=template_dir)
# Security headers: Content-Security-Policy
@app.after_request
def apply_csp(response):
    """Attach a Content-Security-Policy header to every outgoing response.

    Restricts resources to same-origin, except YouTube thumbnails and
    data: URIs for images; inline scripts/styles are allowed.
    """
    directives = [
        "default-src 'self'",
        "img-src 'self' https://img.youtube.com data:",
        "script-src 'self' 'unsafe-inline'",
        "style-src 'self' 'unsafe-inline'",
    ]
    response.headers["Content-Security-Policy"] = "; ".join(directives) + ";"
    return response
# Route: Home page
@app.route("/")
def index():
    """Render the landing page with the search form."""
    return render_template("index.html")
# Health check (fast) — for HF Spaces readiness
@app.get("/health")
def health():
    """Lightweight readiness probe (no heavy imports are triggered here)."""
    return {"ok": True}, 200
# Template filter: convert HH:MM:SS to seconds
@app.template_filter("jump_time")
def jump_time(timestamp):
    """Convert an ``HH:MM:SS`` timestamp string to whole seconds.

    The result is rewound by 2 seconds (floored at 0) so video playback
    starts just before the matched subtitle line.  Malformed input yields 0.
    """
    try:
        h, m, s = timestamp.split(':')
        # Seconds may carry a fractional part (e.g. "00:01:23.456").
        total = int(h) * 3600 + int(m) * 60 + int(float(s))
        return max(total - 2, 0)
    except (AttributeError, ValueError):
        # AttributeError: non-string input; ValueError: wrong segment
        # count or non-numeric parts.  (Was a bare except, which also
        # swallowed SystemExit/KeyboardInterrupt.)
        return 0
# NLP helpers: lemmatizer + synonym expansion
lemmatizer = WordNetLemmatizer()


def get_synonyms(word):
    """Return the set of WordNet lemma names for *word*, with underscores
    replaced by spaces (WordNet joins multi-word lemmas with ``_``)."""
    return {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
# Highlighting:
def highlight_keywords(text, keyword, semantic_mode=False):
    """Wrap occurrences of *keyword* in HTML-escaped *text* with <mark> tags.

    Exact matches always win; when none are found and *semantic_mode* is
    True, lemmas and WordNet synonyms of each query word are highlighted
    instead.  Terms of 3 characters or fewer are matched only on word
    boundaries so short queries don't light up substrings.

    Returns the escaped text (a str/Markup) with <mark> spans inserted.
    """
    safe_text = escape(text)
    # Escape the keyword the same way the text was escaped; otherwise a
    # keyword containing &, <, >, or quotes (escaped to entities in
    # safe_text) could never match.
    safe_keyword = str(escape(keyword))
    if len(keyword) <= 3:
        pattern = re.compile(rf"(?<!\w){re.escape(safe_keyword)}(?!\w)", re.IGNORECASE)
    else:
        pattern = re.compile(re.escape(safe_keyword), re.IGNORECASE)
    if pattern.search(safe_text):
        return pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", safe_text)
    # Semantic mode: fall back to lemma/synonym highlighting per query word.
    if semantic_mode:
        for w in keyword.split():
            lemma = lemmatizer.lemmatize(w.lower())
            candidates = {lemma} | get_synonyms(w)
            for cand in candidates:
                safe_cand = str(escape(cand))
                if len(cand) <= 3:
                    syn_pattern = re.compile(rf"(?<!\w){re.escape(safe_cand)}(?!\w)", re.IGNORECASE)
                else:
                    syn_pattern = re.compile(rf"\b{re.escape(safe_cand)}\b", re.IGNORECASE)
                if syn_pattern.search(safe_text):
                    safe_text = syn_pattern.sub(lambda m: f"<mark>{m.group(0)}</mark>", safe_text)
        return safe_text
    return safe_text
# Core search orchestration
def perform_search(query, start=0, shown=0, previous_results=None, semantic_mode=False):
    """Shared search logic for both HTML and JSON endpoints.

    Fetches matches for *query*, takes one page starting at *start*,
    decorates each result dict in place (video title, neighbour-aware
    summary, highlighted context block), and appends the page to
    *previous_results*.

    Returns (combined_results, total_match_count, updated_shown, next_start).
    """
    if previous_results is None:
        previous_results = []
    # Lazy imports so the heavy model-backed modules load on the first
    # search request instead of at app boot (keeps /health fast).
    from semantic_search import search_query
    from nlp_summary import summarize_text
    raw_results, _ = search_query(query, offset=0, top_k=1000, semantic_mode=semantic_mode)
    # Keyword mode: keep only results whose text literally contains the
    # query (case-insensitive); semantic ranking alone is not enough here.
    if not semantic_mode:
        raw_results = [r for r in raw_results if re.search(re.escape(query), r["text"], re.IGNORECASE)]
    page_size = SEARCH_CONFIG.get("results_per_page", 5)
    paged_results = raw_results[start:start + page_size]
    new_results = []
    for idx, r in enumerate(paged_results):
        vid_id = r.get("video_id")
        # Reverse-lookup the friendly metadata key whose "id" matches this
        # result's video id; None if the video is unknown.
        friendly_key = next((k for k, v in VIDEO_METADATA.items() if v["id"] == vid_id), None)
        r["video_title"] = VIDEO_METADATA.get(friendly_key, {}).get("title", "Unknown Title")
        # Build summary input from the previous/current/next chunks on this
        # page so the summary has surrounding context.
        context_chunks = []
        if idx > 0:
            context_chunks.append(paged_results[idx - 1]["summary_input"])
        context_chunks.append(r["summary_input"])
        if idx + 1 < len(paged_results):
            context_chunks.append(paged_results[idx + 1]["summary_input"])
        summary = summarize_text(" ".join(context_chunks), query=query)
        highlighted_before = highlight_keywords(r["context_before"], query, semantic_mode)
        highlighted_match = highlight_keywords(r["text"], query, semantic_mode)
        highlighted_after = highlight_keywords(r["context_after"], query, semantic_mode)
        # Markup: highlight_keywords already escaped the text, so this
        # HTML is safe to render unescaped in the template.
        r["highlighted_block"] = Markup(f"{highlighted_before}\n{highlighted_match}\n{highlighted_after}")
        r["summary"] = summary
        new_results.append(r)
    combined_results = previous_results + new_results
    shown += len(new_results)
    return combined_results, len(raw_results), shown, start + len(new_results)
# HTML endpoint
@app.route("/search", methods=["POST"])
def search():
    """Handle the HTML search form POST and render results.html.

    Reads query/paging state from the form, re-runs the shared search
    logic, and offers an abbreviation expansion suggestion when the query
    matches ABBREVIATION_MAP.
    """
    query = request.form.get("query", "").strip()
    if not query:
        return render_template("index.html", error="Please enter a search query.")
    semantic_mode = request.form.get("semantic") == "true"
    start = int(request.form.get("start", 0))
    try:
        previous_results = flask_json.loads(request.form.get("previous_results", "[]"))
    except (TypeError, ValueError):
        # Narrowed from a bare except: only malformed JSON is expected here.
        previous_results = []
    # SECURITY NOTE(review): previous_results is client-supplied; wrapping
    # its highlighted_block in Markup trusts arbitrary client HTML and is a
    # potential XSS vector — consider re-highlighting server-side instead.
    for r in previous_results:
        if isinstance(r, dict) and "highlighted_block" in r:
            r["highlighted_block"] = Markup(r["highlighted_block"])
    shown = int(request.form.get("shown", len(previous_results)))
    combined_results, total_matches, shown, next_start = perform_search(
        query, start, shown, previous_results, semantic_mode
    )
    # Abbreviation suggestion: offer the expansion for an abbreviation,
    # or the abbreviation for a known expansion.
    suggestion_term = ""
    lower_query = query.lower()
    if lower_query in ABBREVIATION_MAP:
        suggestion_term = ABBREVIATION_MAP[lower_query]
    elif lower_query in ABBREVIATION_MAP.values():
        for abbr, full in ABBREVIATION_MAP.items():
            if full == lower_query:
                suggestion_term = abbr
                break
    return render_template(
        "results.html",
        query=query,
        results=combined_results,
        shown=shown,
        start=next_start,
        total_matches=total_matches,
        previous_results=combined_results,
        suggestion_term=suggestion_term,
        semantic=semantic_mode
    )
# JSON API endpoint
@app.route("/api/search", methods=["POST"])
def api_search():
    """JSON variant of /search: returns rendered result-card HTML plus
    paging state for the front-end's "load more" flow."""
    payload = request.get_json(force=True)
    query = payload.get("query", "").strip()
    semantic_mode = bool(payload.get("semantic", False))
    start = int(payload.get("start", 0))
    shown = int(payload.get("shown", 0))
    previous_results = payload.get("previous_results", [])
    combined_results, total_matches, shown, next_start = perform_search(
        query, start, shown, previous_results, semantic_mode
    )
    # Only the newest page of results is rendered to HTML cards.
    page_size = SEARCH_CONFIG.get("results_per_page", 5)
    rendered_cards = []
    for result in combined_results[-page_size:]:
        card = render_template(
            "_result_card.html", result=result, query=query, semantic=semantic_mode
        )
        rendered_cards.append(card)
    return jsonify({
        "html": rendered_cards,
        "shown": shown,
        "total_matches": total_matches,
        "next_start": next_start,
        "has_more": next_start < total_matches,
    })
# Autocomplete endpoint
@app.route("/autocomplete", methods=["GET"])
def autocomplete():
    """Return autocomplete suggestions for the partial query *term* as JSON."""
    term = request.args.get("term", "")
    # jsonify sets the application/json Content-Type; returning
    # flask_json.dumps(...) directly served the JSON as text/html.
    return jsonify(get_suggestions(term))
if __name__ == "__main__":
    # Hugging Face Spaces injects PORT; 7860 is its default.
    listen_port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port, debug=False)
|