import gradio as gr
import torch
from PIL import Image
import json
import os
import time
from transformers import AutoProcessor, AutoModelForImageTextToText
import logging
import spaces

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-operator"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")
    logger.warning("Please set HF_TOKEN in your environment variables or Spaces secrets.")


class LOperatorDemo:
    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False

    def load_model(self):
        """Load the L-Operator model and processor, timing the load"""
        try:
            start_time = time.time()
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")

            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."

            # Load model with progress logging; pass the token so gated/private
            # repos can be downloaded
            logger.info("Downloading and loading model weights...")
            self.model = AutoModelForImageTextToText.from_pretrained(
                MODEL_ID,
                device_map="auto",
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                trust_remote_code=True,
                token=HF_TOKEN,
            )

            # Load processor
            logger.info("Loading processor...")
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                token=HF_TOKEN,
            )

            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)

            self.is_loaded = True
            load_time = time.time() - start_time
            logger.info(f"Model loaded successfully in {load_time:.1f} seconds")
            return f"✅ Model loaded successfully in {load_time:.1f} seconds"

        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)} - This may be a custom model requiring special handling"
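    # Note on the decorator below: on Hugging Face Spaces with ZeroGPU,
    # @spaces.GPU requests a GPU for the duration of each decorated call;
    # outside a ZeroGPU environment the decorator is effectively a no-op,
    # so the method also runs locally (on CPU or a regular GPU) unchanged.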
    @spaces.GPU(duration=120)  # 2 minutes for action generation
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate an action based on the screenshot and text inputs"""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."

        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ],
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {
                            "type": "text",
                            "text": (
                                f"Goal: {goal}\nStep: {instruction}\n"
                                "Respond with a JSON action containing relevant keys "
                                "(e.g., action_type, x, y, text, app_name, direction)."
                            ),
                        },
                    ],
                },
            ]

            logger.info("Processing conversation with processor...")

            # Process inputs. tokenize=True/return_dict=True make the processor
            # return a dict of tensors (input_ids, pixel_values, ...) rather
            # than a rendered prompt string.
            try:
                inputs = self.processor.apply_chat_template(
                    conversation,
                    add_generation_prompt=True,
                    tokenize=True,
                    return_dict=True,
                    return_tensors="pt",
                ).to(self.model.device)
                input_length = inputs["input_ids"].shape[-1]
                logger.info(f"Prompt length: {input_length} tokens")
            except Exception as e:
                logger.error(f"Error in processor: {str(e)}")
                return f"❌ Error in processor: {str(e)}"

            # Generate response
            logger.info("Generating response...")
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                )

            # Decode only the newly generated tokens
            logger.info("Decoding response...")
            response = self.processor.tokenizer.decode(
                outputs[0][input_length:],
                skip_special_tokens=True,
            )

            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
                return json.dumps(parsed_response, indent=2)
            except json.JSONDecodeError:
                return response

        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"


# Initialize demo
demo_instance = LOperatorDemo()
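# Minimal sketch of driving the agent without the Gradio UI (illustrative
# only; assumes the model loads successfully and that "screenshot.png"
# exists -- the path and prompt strings below are hypothetical):
#
#   demo_instance.load_model()
#   screenshot = Image.open("screenshot.png")
#   print(demo_instance.generate_action(
#       screenshot,
#       goal="Open the Settings app",
#       instruction="Tap the Settings icon on the home screen",
#   ))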
def process_input(image, goal):
    """Process the input and generate action"""
    if image is None:
        return "❌ Please upload an Android screenshot image."
    if not goal.strip():
        return "❌ Please provide a goal."
    if not demo_instance.is_loaded:
        return "❌ Model not loaded. Please wait for it to load automatically."

    try:
        # Handle different image formats
        pil_image = None
        if hasattr(image, "mode"):
            # PIL Image object
            pil_image = image
        elif isinstance(image, str) and os.path.exists(image):
            # Handle file path (from examples)
            pil_image = Image.open(image)
        elif hasattr(image, "name") and os.path.exists(image.name):
            # Handle Gradio file object
            pil_image = Image.open(image.name)
        else:
            return "❌ Invalid image format. Please upload a valid image."

        if pil_image is None:
            return "❌ Failed to process image. Please try again."

        # Convert image to RGB if needed
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")

        # Generate action using goal as both goal and instruction
        response = demo_instance.generate_action(pil_image, goal, goal)
        return response

    except Exception as e:
        logger.error(f"Error processing input: {str(e)}")
        return f"❌ Error: {str(e)}"


def load_example_episodes():
    """Load example episodes using PIL to load images directly"""
    examples = []
    try:
        episode_dirs = ["episode_13", "episode_53", "episode_73"]
        for episode_dir in episode_dirs:
            try:
                metadata_path = f"extracted_episodes_duckdb/{episode_dir}/metadata.json"
                image_path = f"extracted_episodes_duckdb/{episode_dir}/screenshots/screenshot_1.png"

                # Check if both files exist
                if os.path.exists(metadata_path) and os.path.exists(image_path):
                    logger.info(f"Loading example from {episode_dir}")
                    with open(metadata_path, "r") as f:
                        metadata = json.load(f)

                    # Load image directly with PIL
                    pil_image = Image.open(image_path)
                    episode_num = episode_dir.split("_")[1]
                    goal_text = metadata.get("goal", f"Episode {episode_num} example")
                    logger.info(f"Episode {episode_num} goal: {goal_text}")

                    examples.append([
                        pil_image,  # Use PIL Image object directly
                        goal_text,  # Use the goal text from metadata
                    ])
                    logger.info(f"Successfully loaded example for Episode {episode_num}")
            except Exception as e:
                logger.warning(f"Could not load example for {episode_dir}: {str(e)}")
                continue
    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []

    logger.info(f"Loaded {len(examples)} examples using PIL")
    return examples
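# Expected on-disk layout for the bundled examples (inferred from the paths
# used above; an episode missing either file is skipped):
#
#   extracted_episodes_duckdb/
#     episode_13/
#       metadata.json        # should contain a "goal" field (a fallback is used otherwise)
#       screenshots/
#         screenshot_1.png
#     episode_53/ ...
#     episode_73/ ...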
# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface using Blocks"""
    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container { max-width: 1200px !important; }
        .output-container { min-height: 200px; }
        """,
    ) as demo:
        gr.Markdown("""
        # 🤖 L-Operator: Android Device Control Demo

        **Lightweight Multimodal Android Device Control Agent**

        This demo showcases L-Operator, a multimodal AI agent fine-tuned from
        LiquidAI's LFM2-VL-1.6B and optimized for Android device control through
        visual understanding and action generation.

        ## 🚀 How to Use

        1. **Upload Screenshot**: Upload an Android device screenshot
        2. **Describe Goal**: Enter what you want to accomplish
        3. **Get Actions**: The model will generate JSON actions for Android device control

        ## 📋 Expected Output Format

        The model generates JSON actions in the following format:

        ```json
        {
            "action_type": "tap",
            "x": 540,
            "y": 1200,
            "text": "Settings",
            "app_name": "com.android.settings",
            "confidence": 0.92
        }
        ```

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📱 Upload Screenshot")
                image_input = gr.Image(
                    label="Android Screenshot",
                    type="pil",
                    height=400,
                )

                gr.Markdown("### 🎯 Goal")
                goal_input = gr.Textbox(
                    label="What would you like to accomplish?",
                    placeholder="e.g., Open the Settings app and navigate to Display settings",
                    lines=3,
                )

                # Process button
                process_btn = gr.Button("🚀 Generate Action", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### 📊 Generated Action")
                output_text = gr.Textbox(
                    label="JSON Action Output",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["output-container"],
                )

        # Connect the process button
        process_btn.click(
            fn=process_input,
            inputs=[image_input, goal_input],
            outputs=output_text,
        )

        # Load examples
        gr.Markdown("### 📚 Example Episodes")
        try:
            examples = load_example_episodes()
            if examples:
                with gr.Row():
                    for i, (image, goal) in enumerate(examples):
                        with gr.Column(scale=1):
                            gr.Markdown(f"**Episode {i + 1}**")
                            example_image = gr.Image(
                                value=image,
                                label=f"Example {i + 1}",
                                height=200,
                                interactive=False,
                            )
                            example_goal = gr.Textbox(
                                value=goal,
                                label="Goal",
                                lines=2,
                                interactive=False,
                            )
                            # Create a button to load this example
                            load_example_btn = gr.Button(f"Load Example {i + 1}", size="sm")
                            load_example_btn.click(
                                fn=lambda img, g: (img, g),
                                inputs=[example_image, example_goal],
                                outputs=[image_input, goal_input],
                            )
        except Exception as e:
            logger.warning(f"Failed to load examples: {str(e)}")
            gr.Markdown("❌ Failed to load examples. Please upload your own screenshot.")

        # Load model automatically on startup
        def load_model_on_startup():
            """Load the model in the background without surfacing feedback in the UI"""
            if not demo_instance.is_loaded:
                logger.info("Loading L-Operator model automatically...")
                try:
                    demo_instance.load_model()
                    logger.info("Model loaded successfully in background")
                except Exception as e:
                    logger.error(f"Failed to load model: {str(e)}")

        # Load model automatically on page load
        demo.load(fn=load_model_on_startup)

        gr.Markdown("""
        ---

        ## 📊 Model Details

        | Property | Value |
        |----------|-------|
        | **Base Model** | LiquidAI/LFM2-VL-1.6B |
        | **Architecture** | LFM2-VL (1.6B parameters) |
        | **Fine-tuning** | LoRA (Low-Rank Adaptation) |
        | **Training Data** | Android control episodes with screenshots and actions |

        ## 🎯 Use Cases

        - **Mobile App Testing**: Automated UI testing for Android applications
        - **Accessibility Applications**: Voice-controlled device navigation
        - **Remote Support**: Remote device troubleshooting
        - **Development Workflows**: UI/UX testing automation

        ---

        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control)
        """)

    return demo


# Create and launch the demo with optimized settings
if __name__ == "__main__":
    try:
        logger.info("Creating Gradio demo interface...")
        demo = create_demo()
        logger.info("Launching Gradio server...")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,    # Disable debug to reduce startup time
            show_error=True,
            ssr_mode=False,
            max_threads=2,  # Limit threads to prevent resource exhaustion
            quiet=True,     # Reduce startup logging noise
        )
    except Exception as e:
        logger.error(f"Failed to launch Gradio app: {str(e)}")
        raise