import gradio as gr
import torch
from PIL import Image
import json
import os
import time
from transformers import AutoProcessor, AutoModelForImageTextToText
import logging
import spaces

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-operator"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")
    logger.warning("Please set HF_TOKEN in your environment variables or Spaces secrets.")


class LOperatorDemo:
    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False

    def load_model(self):
        """Load the L-Operator model and processor, timing the load"""
        try:
            start_time = time.time()
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")

            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."

            # Load model with progress logging; pass the token so gated/private
            # repos can be downloaded
            logger.info("Downloading and loading model weights...")
            self.model = AutoModelForImageTextToText.from_pretrained(
                MODEL_ID,
                device_map="auto",
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                trust_remote_code=True,
                token=HF_TOKEN,
            )

            # Load processor
            logger.info("Loading processor...")
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                token=HF_TOKEN,
            )

            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)

            self.is_loaded = True
            load_time = time.time() - start_time
            logger.info(f"Model loaded successfully in {load_time:.1f} seconds")
            return f"✅ Model loaded successfully in {load_time:.1f} seconds"

        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)} - This may be a custom model requiring special handling"
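    # Note on the decorator below: on Hugging Face Spaces with ZeroGPU,
    # @spaces.GPU requests a GPU for the duration of each decorated call;
    # outside a ZeroGPU environment the decorator is effectively a no-op,
    # so the method also runs locally (on CPU or a regular GPU) unchanged.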
    @spaces.GPU(duration=120)  # 2 minutes for action generation
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate an action based on the screenshot and text inputs"""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."

        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {
                            "type": "text",
                            "text": (
                                f"Goal: {goal}\nStep: {instruction}\n"
                                "Respond with a JSON action containing relevant keys "
                                "(e.g., action_type, x, y, text, app_name, direction)."
                            ),
                        },
                    ],
                },
            ]

            logger.info("Processing conversation with processor...")

            # Process inputs. tokenize=True/return_dict=True make the processor
            # return a dict of tensors (input_ids, pixel_values, ...) rather
            # than a rendered prompt string.
            try:
                inputs = self.processor.apply_chat_template(
                    conversation,
                    add_generation_prompt=True,
                    tokenize=True,
                    return_dict=True,
                    return_tensors="pt",
                ).to(self.model.device)
                input_length = inputs["input_ids"].shape[-1]
                logger.info(f"Prompt length: {input_length} tokens")
            except Exception as e:
                logger.error(f"Error in processor: {str(e)}")
                return f"❌ Error in processor: {str(e)}"

            # Generate response
            logger.info("Generating response...")
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                )

            # Decode only the newly generated tokens
            logger.info("Decoding response...")
            response = self.processor.tokenizer.decode(
                outputs[0][input_length:],
                skip_special_tokens=True,
            )

            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
                return json.dumps(parsed_response, indent=2)
            except json.JSONDecodeError:
                return response

        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"


# Initialize demo
demo_instance = LOperatorDemo()
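# Minimal sketch of driving the agent without the Gradio UI (illustrative
# only; assumes the model loads successfully and that "screenshot.png"
# exists -- the path and prompt strings below are hypothetical):
#
#   demo_instance.load_model()
#   screenshot = Image.open("screenshot.png")
#   print(demo_instance.generate_action(
#       screenshot,
#       goal="Open the Settings app",
#       instruction="Tap the Settings icon on the home screen",
#   ))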
def process_input(image, goal):
    """Process the input and generate action"""
    if image is None:
        return "❌ Please upload an Android screenshot image."
    if not goal.strip():
        return "❌ Please provide a goal."
    if not demo_instance.is_loaded:
        return "❌ Model not loaded. Please wait for it to load automatically."

    try:
        # Handle different image formats
        pil_image = None
        if hasattr(image, "mode"):
            # PIL Image object
            pil_image = image
        elif isinstance(image, str) and os.path.exists(image):
            # Handle file path (from examples)
            pil_image = Image.open(image)
        elif hasattr(image, "name") and os.path.exists(image.name):
            # Handle Gradio file object
            pil_image = Image.open(image.name)
        else:
            return "❌ Invalid image format. Please upload a valid image."

        if pil_image is None:
            return "❌ Failed to process image. Please try again."

        # Convert image to RGB if needed
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")

        # Generate action using goal as both goal and instruction
        response = demo_instance.generate_action(pil_image, goal, goal)
        return response

    except Exception as e:
        logger.error(f"Error processing input: {str(e)}")
        return f"❌ Error: {str(e)}"


def load_example_episodes():
    """Load example episodes using PIL to load images directly"""
    examples = []
    try:
        episode_dirs = ["episode_13", "episode_53", "episode_73"]
        for episode_dir in episode_dirs:
            try:
                metadata_path = f"extracted_episodes_duckdb/{episode_dir}/metadata.json"
                image_path = f"extracted_episodes_duckdb/{episode_dir}/screenshots/screenshot_1.png"

                # Check if both files exist
                if os.path.exists(metadata_path) and os.path.exists(image_path):
                    logger.info(f"Loading example from {episode_dir}")
                    with open(metadata_path, "r") as f:
                        metadata = json.load(f)

                    # Load image directly with PIL
                    pil_image = Image.open(image_path)
                    episode_num = episode_dir.split("_")[1]
                    goal_text = metadata.get("goal", f"Episode {episode_num} example")
                    logger.info(f"Episode {episode_num} goal: {goal_text}")

                    examples.append([
                        pil_image,  # Use PIL Image object directly
                        goal_text,  # Use the goal text from metadata
                    ])
                    logger.info(f"Successfully loaded example for Episode {episode_num}")
            except Exception as e:
                logger.warning(f"Could not load example for {episode_dir}: {str(e)}")
                continue
    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []

    logger.info(f"Loaded {len(examples)} examples using PIL")
    return examples
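# Expected on-disk layout for the bundled examples (inferred from the paths
# used above; an episode missing either file is skipped):
#
#   extracted_episodes_duckdb/
#     episode_13/
#       metadata.json        # should contain a "goal" field (a fallback is used otherwise)
#       screenshots/
#         screenshot_1.png
#     episode_53/ ...
#     episode_73/ ...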
# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface using Blocks"""
    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1200px !important; }
        .output-container { min-height: 200px; }
        """,
    ) as demo:
        gr.Markdown("""
        # 🤖 L-Operator: Android Device Control Demo

        **Lightweight Multimodal Android Device Control Agent**

        This demo showcases L-Operator, a multimodal AI agent fine-tuned from
        LiquidAI's LFM2-VL-1.6B and optimized for Android device control through
        visual understanding and action generation.

        ## 🚀 How to Use

        1. **Upload Screenshot**: Upload an Android device screenshot
        2. **Describe Goal**: Enter what you want to accomplish
        3. **Get Actions**: The model will generate JSON actions for Android device control

        ## 📋 Expected Output Format

        The model generates JSON actions in the following format:

        ```json
        {
            "action_type": "tap",
            "x": 540,
            "y": 1200,
            "text": "Settings",
            "app_name": "com.android.settings",
            "confidence": 0.92
        }
        ```

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📱 Upload Screenshot")
                image_input = gr.Image(
                    label="Android Screenshot",
                    type="pil",
                    height=400,
                )

                gr.Markdown("### 🎯 Goal")
                goal_input = gr.Textbox(
                    label="What would you like to accomplish?",
                    placeholder="e.g., Open the Settings app and navigate to Display settings",
                    lines=3,
                )

                # Process button
                process_btn = gr.Button("🚀 Generate Action", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### 📊 Generated Action")
                output_text = gr.Textbox(
                    label="JSON Action Output",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["output-container"],
                )

        # Connect the process button
        process_btn.click(
            fn=process_input,
            inputs=[image_input, goal_input],
            outputs=output_text,
        )

        # Load examples
        gr.Markdown("### 📚 Example Episodes")
        try:
            examples = load_example_episodes()
            if examples:
                with gr.Row():
                    for i, (image, goal) in enumerate(examples):
                        with gr.Column(scale=1):
                            gr.Markdown(f"**Episode {i + 1}**")
                            example_image = gr.Image(
                                value=image,
                                label=f"Example {i + 1}",
                                height=200,
                                interactive=False,
                            )
                            example_goal = gr.Textbox(
                                value=goal,
                                label="Goal",
                                lines=2,
                                interactive=False,
                            )
                            # Create a button to load this example
                            load_example_btn = gr.Button(f"Load Example {i + 1}", size="sm")
                            load_example_btn.click(
                                fn=lambda img, g: (img, g),
                                inputs=[example_image, example_goal],
                                outputs=[image_input, goal_input],
                            )
        except Exception as e:
            logger.warning(f"Failed to load examples: {str(e)}")
            gr.Markdown("❌ Failed to load examples. Please upload your own screenshot.")

        # Load model automatically on startup
        def load_model_on_startup():
            """Load the model in the background without surfacing feedback in the UI"""
            if not demo_instance.is_loaded:
                logger.info("Loading L-Operator model automatically...")
                try:
                    demo_instance.load_model()
                    logger.info("Model loaded successfully in background")
                except Exception as e:
                    logger.error(f"Failed to load model: {str(e)}")

        # Load model automatically on page load
        demo.load(fn=load_model_on_startup)

        gr.Markdown("""
        ---

        ## 📊 Model Details

        | Property | Value |
        |----------|-------|
        | **Base Model** | LiquidAI/LFM2-VL-1.6B |
        | **Architecture** | LFM2-VL (1.6B parameters) |
        | **Fine-tuning** | LoRA (Low-Rank Adaptation) |
        | **Training Data** | Android control episodes with screenshots and actions |

        ## 🎯 Use Cases

        - **Mobile App Testing**: Automated UI testing for Android applications
        - **Accessibility Applications**: Voice-controlled device navigation
        - **Remote Support**: Remote device troubleshooting
        - **Development Workflows**: UI/UX testing automation

        ---

        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control)
        """)

    return demo


# Create and launch the demo with optimized settings
if __name__ == "__main__":
    try:
        logger.info("Creating Gradio demo interface...")
        demo = create_demo()
        logger.info("Launching Gradio server...")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,    # Disable debug to reduce startup time
            show_error=True,
            ssr_mode=False,
            max_threads=2,  # Limit threads to prevent resource exhaustion
            quiet=True,     # Reduce startup logging noise
        )
    except Exception as e:
        logger.error(f"Failed to launch Gradio app: {str(e)}")
        raise