import gradio as gr
import torch
from PIL import Image
import json
import os
from transformers import AutoProcessor, AutoModelForImageTextToText
from typing import List, Dict, Any
import logging
import spaces

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-android-control"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
import os
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")

class LOperatorDemo:
    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False
        
    def load_model(self):
        """Load the L-Operator model and processor"""
        try:
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")
            
            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."
            
            try:
                # Try loading with standard approach
                self.processor = AutoProcessor.from_pretrained(
                    MODEL_ID, 
                    trust_remote_code=True,
                    token=HF_TOKEN
                )
                
                self.model = AutoModelForImageTextToText.from_pretrained(
                    MODEL_ID,
                    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                    trust_remote_code=True,
                    device_map="auto" if DEVICE == "cuda" else None,
                    token=HF_TOKEN
                )
                
            except Exception as e:
                logger.warning(f"Standard loading failed: {str(e)}")
                logger.info("Attempting fallback loading approach...")
                
                # Fallback: try loading with explicit model type
                self.processor = AutoProcessor.from_pretrained(
                    MODEL_ID, 
                    trust_remote_code=True,
                    token=HF_TOKEN,
                    revision="main"
                )
                
                self.model = AutoModelForImageTextToText.from_pretrained(
                    MODEL_ID,
                    torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                    trust_remote_code=True,
                    device_map="auto" if DEVICE == "cuda" else None,
                    token=HF_TOKEN,
                    revision="main",
                    ignore_mismatched_sizes=True
                )
            
            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)
            
            self.is_loaded = True
            logger.info("Model loaded successfully with token authentication")
            return "✅ Model loaded successfully with token authentication!"
            
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)} - This may be a custom model requiring special handling"
    
    @spaces.GPU(duration=120)  # 2 minutes for action generation
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate action based on image and text inputs"""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."
        
        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")
            
            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": f"Goal: {goal}\nStep: {instruction}\nRespond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."}
                    ]
                }
            ]
            
            # Process inputs
            inputs = self.processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(self.model.device)
            
            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )
            
            response = self.processor.tokenizer.decode(
                outputs[0][inputs.shape[1]:], 
                skip_special_tokens=True
            )
            
            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
                return json.dumps(parsed_response, indent=2)
            except:
                return response
                
        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"
    
    @spaces.GPU(duration=90)  # 1.5 minutes for chat responses
    def chat_with_model(self, message: str, history: List[Dict[str, str]], image: Image.Image = None) -> List[Dict[str, str]]:
        """Chat interface function for Gradio"""
        if not self.is_loaded:
            return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Model not loaded. Please load the model first."}]
        
        if image is None:
            return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Please upload an Android screenshot image."}]
        
        try:
            # Extract goal and instruction from message
            if "Goal:" in message and "Step:" in message:
                # Parse structured input
                lines = message.split('\n')
                goal = ""
                instruction = ""
                
                for line in lines:
                    if line.startswith("Goal:"):
                        goal = line.replace("Goal:", "").strip()
                    elif line.startswith("Step:"):
                        instruction = line.replace("Step:", "").strip()
                
                if not goal or not instruction:
                    return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Please provide both Goal and Step in your message."}]
            else:
                # Treat as general instruction
                goal = "Complete the requested action"
                instruction = message
            
            # Generate action
            response = self.generate_action(image, goal, instruction)
            return history + [{"role": "user", "content": message}, {"role": "assistant", "content": response}]
            
        except Exception as e:
            logger.error(f"Error in chat: {str(e)}")
            return history + [{"role": "user", "content": message}, {"role": "assistant", "content": f"❌ Error: {str(e)}"}]

# Initialize demo
demo_instance = LOperatorDemo()

# Auto-load the model on startup
def auto_load_model():
    """Auto-load the model when the application starts"""
    try:
        logger.info("Auto-loading L-Operator model on startup...")
        result = demo_instance.load_model()
        logger.info(f"Auto-load result: {result}")
        return result
    except Exception as e:
        logger.error(f"Error auto-loading model: {str(e)}")
        return f"❌ Error auto-loading model: {str(e)}"

# Load model automatically (this happens during import)
print("🚀 Auto-loading L-Operator model on startup...")
auto_load_model()
print("✅ Model loading completed!")

# Load example episodes
def load_example_episodes():
    """Load example episodes from the extracted data"""
    examples = []
    
    try:
        # Load episode 13
        with open("extracted_episodes_duckdb/episode_13/metadata.json", "r") as f:
            episode_13 = json.load(f)
        
        # Load episode 53  
        with open("extracted_episodes_duckdb/episode_53/metadata.json", "r") as f:
            episode_53 = json.load(f)
        
        # Load episode 73
        with open("extracted_episodes_duckdb/episode_73/metadata.json", "r") as f:
            episode_73 = json.load(f)
        
        # Create examples with simple identifiers
        examples = [
            [
                "extracted_episodes_duckdb/episode_13/screenshots/screenshot_1.png",
                "Episode 13: Navigate app interface"
            ],
            [
                "extracted_episodes_duckdb/episode_53/screenshots/screenshot_1.png", 
                "Episode 53: App interaction example"
            ],
            [
                "extracted_episodes_duckdb/episode_73/screenshots/screenshot_1.png",
                "Episode 73: Device control task"
            ]
        ]
        
    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []
    
    return examples

# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface"""
    
    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .chat-container {
            height: 600px;
        }
        """
    ) as demo:
        
        gr.Markdown("""
        # 🤖 L-Operator: Android Device Control Demo
        
        **Lightweight Multimodal Android Device Control Agent**
        
        This demo showcases the L-Operator model, a fine-tuned multimodal AI agent based on LiquidAI's LFM2-VL-1.6B model, 
        optimized for Android device control through visual understanding and action generation.
        
        ## 🚀 How to Use
        
        1. **Model Loading**: The L-Operator model loads automatically on startup
        2. **Upload Screenshot**: Upload an Android device screenshot
        3. **Provide Instructions**: Enter your goal and step instructions
        4. **Get Actions**: The model will generate JSON actions for Android device control
        
        ## 📋 Expected Output Format
        
        The model generates JSON actions in the following format:
        ```json
        {
          "action_type": "tap",
          "x": 540,
          "y": 1200,
          "text": "Settings",
          "app_name": "com.android.settings",
          "confidence": 0.92
        }
        ```
        
        ---
        """)
        
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🤖 Model Status")
                model_status = gr.Textbox(
                    label="L-Operator Model",
                    value="🔄 Loading model on startup...",
                    interactive=False
                )
                
                gr.Markdown("### 📱 Input")
                image_input = gr.Image(
                    label="Android Screenshot",
                    type="pil",
                    height=400,
                    sources=["upload"]
                )
                
                gr.Markdown("### 📝 Instructions")
                goal_input = gr.Textbox(
                    label="Goal",
                    placeholder="e.g., Open the Settings app and navigate to Display settings",
                    lines=2
                )
                
                step_input = gr.Textbox(
                    label="Step Instruction",
                    placeholder="e.g., Tap on the Settings app icon on the home screen",
                    lines=2
                )
                
                generate_btn = gr.Button("🎯 Generate Action", variant="secondary")
                
            with gr.Column(scale=2):
                gr.Markdown("### 💬 Chat Interface")
                chat_interface = gr.ChatInterface(
                    fn=demo_instance.chat_with_model,
                    additional_inputs=[image_input],
                    title="L-Operator Chat",
                    description="Chat with L-Operator using screenshots and text instructions",
                    examples=load_example_episodes(),
                    type="messages"
                )
                
                gr.Markdown("### 🎯 Action Output")
                action_output = gr.JSON(
                    label="Generated Action",
                    value={},
                    height=200
                )
        
        # Event handlers
        def on_generate_action(image, goal, step):
            if not image:
                return {"error": "Please upload an image"}
            
            if not goal or not step:
                return {"error": "Please provide both goal and step"}
            
            response = demo_instance.generate_action(image, goal, step)
            
            try:
                # Try to parse as JSON
                parsed = json.loads(response)
                return parsed
            except:
                return {"raw_response": response}
        
        # Update model status on page load
        def update_model_status():
            if demo_instance.is_loaded:
                return "✅ L-Operator model loaded and ready!"
            else:
                return "❌ Model failed to load. Please check logs."
        
        generate_btn.click(
            fn=on_generate_action,
            inputs=[image_input, goal_input, step_input],
            outputs=action_output
        )
        
        # Update model status on page load
        demo.load(
            fn=update_model_status,
            outputs=model_status
        )
        
        # Update chat interface when image changes
        def update_chat_image(image):
            return image
        
        image_input.change(
            fn=update_chat_image,
            inputs=[image_input],
            outputs=[chat_interface.chatbot]
        )
        
        gr.Markdown("""
        ---
        
        ## 📊 Model Details
        
        | Property | Value |
        |----------|-------|
        | **Base Model** | LiquidAI/LFM2-VL-1.6B |
        | **Architecture** | LFM2-VL (1.6B parameters) |
        | **Fine-tuning** | LoRA (Low-Rank Adaptation) |
        | **Training Data** | Android control episodes with screenshots and actions |
        
        ## 🎯 Use Cases
        
        - **Mobile App Testing**: Automated UI testing for Android applications
        - **Accessibility Applications**: Voice-controlled device navigation
        - **Remote Support**: Remote device troubleshooting
        - **Development Workflows**: UI/UX testing automation

                            ---
        
        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control) 
        """)
    
    return demo

# Create and launch the demo
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
        show_error=True,
        ssr_mode=False
    )