import os
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '0'

import re

import gradio as gr
import numpy as np
import torch
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Global variables
chunks = []
embeddings = []
model = None
tokenizer = None
embed_model = None
text_cache = ""


def initialize_models():
    """Initialize models on startup with optimizations"""
    global model, tokenizer, embed_model

    print("Loading models...")

    # Use smaller, faster embedding model
    embed_model = SentenceTransformer(
        'sentence-transformers/paraphrase-MiniLM-L3-v2',  # Faster, smaller model
        device='cpu'
    )

    # Use smaller, faster language model
    model_name = "microsoft/phi-1_5"  # Much faster than TinyLlama, better quality
    # Alternative: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    )

    # Set padding token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Models loaded successfully!")


def smart_chunk_text(text, chunk_size=500, overlap=100):
    """Smarter chunking that respects sentence boundaries"""
    # Split into sentences
    sentences = re.split(r'[.!?]+', text)

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # If adding this sentence exceeds the chunk size, save the current chunk
        if len(current_chunk) + len(sentence) > chunk_size and current_chunk:
            chunks.append(current_chunk)
            # Start the new chunk with a fixed ~20-word overlap from the previous chunk
            words = current_chunk.split()
            current_chunk = " ".join(words[-20:]) + " " + sentence
        else:
            current_chunk += " " + sentence

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


def process_pdf(pdf_file):
    """Process PDF and create embeddings - OPTIMIZED"""
    global chunks, embeddings, embed_model, text_cache

    if pdf_file is None:
        return "❌ Please upload a PDF file!", None

    try:
        # Read PDF
        pdf_reader = PdfReader(pdf_file.name)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"

        if not text.strip():
            return "❌ Could not extract text from PDF!", None

        text_cache = text  # Cache for faster reprocessing

        # Smart chunking (smaller chunks = faster embedding)
        chunks = smart_chunk_text(text, chunk_size=500, overlap=100)

        # Batch encode for speed
        print(f"Creating embeddings for {len(chunks)} chunks...")
        embeddings = embed_model.encode(
            chunks,
            batch_size=32,  # Process multiple chunks at once
            show_progress_bar=False,
            convert_to_numpy=True
        )

        return f"✅ PDF processed! Created {len(chunks)} chunks. You can now ask questions!", None

    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        return f"❌ Error: {str(e)}", None


def find_relevant_chunks(query, top_k=2):  # Reduced from 3 to 2 for speed
    """Find most relevant chunks - OPTIMIZED"""
    global chunks, embeddings, embed_model

    if not chunks or len(embeddings) == 0:
        return []

    # Encode query
    query_embedding = embed_model.encode(
        [query],
        convert_to_numpy=True,
        show_progress_bar=False
    )[0]

    # Fast cosine similarity using numpy
    embeddings_norm = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    query_norm = query_embedding / np.linalg.norm(query_embedding)
    similarities = np.dot(embeddings_norm, query_norm)

    # Get top-k indices, highest similarity first
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    return [chunks[i] for i in top_indices]


def generate_response(question, context):
    """Generate response - OPTIMIZED"""
    global model, tokenizer

    # Shorter, more efficient prompt
    prompt = f"""Context: {context[:800]}

Question: {question}

Answer:"""

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024  # Reduced from 2048
    )

    # Faster generation settings
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,  # Reduced from 300
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1  # Single beam (no beam search) for speed
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the answer portion of the completion
    if "Answer:" in response:
        response = response.split("Answer:")[-1].strip()

    # Clean up response
    response = response.split("\n")[0].strip()  # Take first line

    return response


def chat(message, history):
    """Handle chat - OPTIMIZED"""
    global chunks

    if not chunks:
        return history + [[message, "⚠️ Please upload and process a PDF first!"]]

    if not message.strip():
        return history

    try:
        # Find relevant context (reduced number of chunks)
        relevant_chunks = find_relevant_chunks(message, top_k=2)
        context = " ".join(relevant_chunks)

        # Generate response
        response = generate_response(message, context)

        # Ensure response is not empty
        if not response or len(response) < 10:
            response = ("I found relevant information but couldn't generate a clear answer. "
                        "Please try rephrasing your question.")

        return history + [[message, response]]

    except Exception as e:
        print(f"Error in chat: {str(e)}")
        return history + [[message, f"❌ Error: {str(e)}"]]


def clear_all():
    """Clear everything"""
    global chunks, embeddings, text_cache
    chunks = []
    embeddings = []
    text_cache = ""
    return None, "Ready to process a new PDF"


# Create UI with better styling
with gr.Blocks(title="Chat with PDF - Fast", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ Chat with PDF - Optimized Fast Version")
    gr.Markdown("*Using lightweight models for faster responses*")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="📎 Upload PDF",
                file_types=[".pdf"]
            )
            process_btn = gr.Button(
                "🔄 Process PDF",
                variant="primary",
                size="lg"
            )
            status = gr.Textbox(
                label="Status",
                lines=2,
                interactive=False
            )

            gr.Markdown("### Tips:")
            gr.Markdown("""
            - Processing is much faster now!
            - Ask specific questions
            - Keep questions concise
            """)

            clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")

        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="💬 Chat",
                height=450,
                bubble_full_width=False
            )
            msg = gr.Textbox(
                label="Question",
                placeholder="Ask a question about the PDF...",
                lines=2
            )
            with gr.Row():
                send_btn = gr.Button("📤 Send", variant="primary")
                clear_btn = gr.Button("Clear Chat")

    # Events
    process_btn.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[status, chatbot]
    )

    msg.submit(
        chat,
        inputs=[msg, chatbot],
        outputs=[chatbot]
    ).then(
        lambda: "",
        None,
        [msg]
    )

    send_btn.click(
        chat,
        inputs=[msg, chatbot],
        outputs=[chatbot]
    ).then(
        lambda: "",
        None,
        [msg]
    )

    clear_btn.click(lambda: None, None, [chatbot])
    clear_all_btn.click(clear_all, None, [chatbot, status])

# Initialize models on startup
initialize_models()

if __name__ == "__main__":
    demo.queue()  # Enable queuing for better performance
    demo.launch(share=False)