import gradio as gr import torch from PIL import Image import json import os from transformers import AutoProcessor, AutoModelForImageTextToText from typing import List, Dict, Any import logging import spaces # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Model configuration MODEL_ID = "Tonic/l-android-control" DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # Get Hugging Face token from environment variable (Spaces secrets) import os HF_TOKEN = os.getenv("HF_TOKEN") if not HF_TOKEN: logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.") class LOperatorDemo: def __init__(self): self.model = None self.processor = None self.is_loaded = False def load_model(self): """Load the L-Operator model and processor""" try: logger.info(f"Loading model {MODEL_ID} on device {DEVICE}") # Check if token is available if not HF_TOKEN: return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets." try: # Try loading with standard approach self.processor = AutoProcessor.from_pretrained( MODEL_ID, trust_remote_code=True, token=HF_TOKEN ) self.model = AutoModelForImageTextToText.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32, trust_remote_code=True, device_map="auto" if DEVICE == "cuda" else None, token=HF_TOKEN ) except Exception as e: logger.warning(f"Standard loading failed: {str(e)}") logger.info("Attempting fallback loading approach...") # Fallback: try loading with explicit model type self.processor = AutoProcessor.from_pretrained( MODEL_ID, trust_remote_code=True, token=HF_TOKEN, revision="main" ) self.model = AutoModelForImageTextToText.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32, trust_remote_code=True, device_map="auto" if DEVICE == "cuda" else None, token=HF_TOKEN, revision="main", ignore_mismatched_sizes=True ) if DEVICE == "cpu": self.model = self.model.to(DEVICE) self.is_loaded = True logger.info("Model loaded successfully with token authentication") return "✅ Model loaded successfully with token authentication!" except Exception as e: logger.error(f"Error loading model: {str(e)}") return f"❌ Error loading model: {str(e)} - This may be a custom model requiring special handling" @spaces.GPU(duration=120) # 2 minutes for action generation def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str: """Generate action based on image and text inputs""" if not self.is_loaded: return "❌ Model not loaded. Please load the model first." try: # Convert image to RGB if needed if image.mode != "RGB": image = image.convert("RGB") # Build conversation conversation = [ { "role": "system", "content": [ {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."} ] }, { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": f"Goal: {goal}\nStep: {instruction}\nRespond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."} ] } ] # Process inputs inputs = self.processor.apply_chat_template( conversation, add_generation_prompt=True, return_tensors="pt" ).to(self.model.device) # Generate response with torch.no_grad(): outputs = self.model.generate( inputs, max_new_tokens=128, do_sample=True, temperature=0.7, top_p=0.9 ) response = self.processor.tokenizer.decode( outputs[0][inputs.shape[1]:], skip_special_tokens=True ) # Try to parse as JSON for better formatting try: parsed_response = json.loads(response) return json.dumps(parsed_response, indent=2) except: return response except Exception as e: logger.error(f"Error generating action: {str(e)}") return f"❌ Error generating action: {str(e)}" @spaces.GPU(duration=90) # 1.5 minutes for chat responses def chat_with_model(self, message: str, history: List[Dict[str, str]], image: Image.Image = None) -> List[Dict[str, str]]: """Chat interface function for Gradio""" if not self.is_loaded: return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Model not loaded. Please load the model first."}] if image is None: return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Please upload an Android screenshot image."}] try: # Extract goal and instruction from message if "Goal:" in message and "Step:" in message: # Parse structured input lines = message.split('\n') goal = "" instruction = "" for line in lines: if line.startswith("Goal:"): goal = line.replace("Goal:", "").strip() elif line.startswith("Step:"): instruction = line.replace("Step:", "").strip() if not goal or not instruction: return history + [{"role": "user", "content": message}, {"role": "assistant", "content": "❌ Please provide both Goal and Step in your message."}] else: # Treat as general instruction goal = "Complete the requested action" instruction = message # Generate action response = self.generate_action(image, goal, instruction) return history + [{"role": "user", "content": message}, {"role": "assistant", "content": response}] except Exception as e: logger.error(f"Error in chat: {str(e)}") return history + [{"role": "user", "content": message}, {"role": "assistant", "content": f"❌ Error: {str(e)}"}] # Initialize demo demo_instance = LOperatorDemo() # Auto-load the model on startup def auto_load_model(): """Auto-load the model when the application starts""" try: logger.info("Auto-loading L-Operator model on startup...") result = demo_instance.load_model() logger.info(f"Auto-load result: {result}") return result except Exception as e: logger.error(f"Error auto-loading model: {str(e)}") return f"❌ Error auto-loading model: {str(e)}" # Load model automatically (this happens during import) print("🚀 Auto-loading L-Operator model on startup...") auto_load_model() print("✅ Model loading completed!") # Load example episodes def load_example_episodes(): """Load example episodes from the extracted data""" examples = [] try: # Load episode 13 with open("extracted_episodes_duckdb/episode_13/metadata.json", "r") as f: episode_13 = json.load(f) # Load episode 53 with open("extracted_episodes_duckdb/episode_53/metadata.json", "r") as f: episode_53 = json.load(f) # Load episode 73 with open("extracted_episodes_duckdb/episode_73/metadata.json", "r") as f: episode_73 = json.load(f) # Create examples with simple identifiers examples = [ [ "extracted_episodes_duckdb/episode_13/screenshots/screenshot_1.png", "Episode 13: Navigate app interface" ], [ "extracted_episodes_duckdb/episode_53/screenshots/screenshot_1.png", "Episode 53: App interaction example" ], [ "extracted_episodes_duckdb/episode_73/screenshots/screenshot_1.png", "Episode 73: Device control task" ] ] except Exception as e: logger.error(f"Error loading examples: {str(e)}") examples = [] return examples # Create Gradio interface def create_demo(): """Create the Gradio demo interface""" with gr.Blocks( title="L-Operator: Android Device Control Demo", theme=gr.themes.Soft(), css=""" .gradio-container { max-width: 1200px !important; } .chat-container { height: 600px; } """ ) as demo: gr.Markdown(""" # 🤖 L-Operator: Android Device Control Demo **Lightweight Multimodal Android Device Control Agent** This demo showcases the L-Operator model, a fine-tuned multimodal AI agent based on LiquidAI's LFM2-VL-1.6B model, optimized for Android device control through visual understanding and action generation. ## 🚀 How to Use 1. **Model Loading**: The L-Operator model loads automatically on startup 2. **Upload Screenshot**: Upload an Android device screenshot 3. **Provide Instructions**: Enter your goal and step instructions 4. **Get Actions**: The model will generate JSON actions for Android device control ## 📋 Expected Output Format The model generates JSON actions in the following format: ```json { "action_type": "tap", "x": 540, "y": 1200, "text": "Settings", "app_name": "com.android.settings", "confidence": 0.92 } ``` --- """) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### 🤖 Model Status") model_status = gr.Textbox( label="L-Operator Model", value="🔄 Loading model on startup...", interactive=False ) gr.Markdown("### 📱 Input") image_input = gr.Image( label="Android Screenshot", type="pil", height=400, sources=["upload"] ) gr.Markdown("### 📝 Instructions") goal_input = gr.Textbox( label="Goal", placeholder="e.g., Open the Settings app and navigate to Display settings", lines=2 ) step_input = gr.Textbox( label="Step Instruction", placeholder="e.g., Tap on the Settings app icon on the home screen", lines=2 ) generate_btn = gr.Button("🎯 Generate Action", variant="secondary") with gr.Column(scale=2): gr.Markdown("### 💬 Chat Interface") chat_interface = gr.ChatInterface( fn=demo_instance.chat_with_model, additional_inputs=[image_input], title="L-Operator Chat", description="Chat with L-Operator using screenshots and text instructions", examples=load_example_episodes(), type="messages" ) gr.Markdown("### 🎯 Action Output") action_output = gr.JSON( label="Generated Action", value={}, height=200 ) # Event handlers def on_generate_action(image, goal, step): if not image: return {"error": "Please upload an image"} if not goal or not step: return {"error": "Please provide both goal and step"} response = demo_instance.generate_action(image, goal, step) try: # Try to parse as JSON parsed = json.loads(response) return parsed except: return {"raw_response": response} # Update model status on page load def update_model_status(): if demo_instance.is_loaded: return "✅ L-Operator model loaded and ready!" else: return "❌ Model failed to load. Please check logs." generate_btn.click( fn=on_generate_action, inputs=[image_input, goal_input, step_input], outputs=action_output ) # Update model status on page load demo.load( fn=update_model_status, outputs=model_status ) # Update chat interface when image changes def update_chat_image(image): return image image_input.change( fn=update_chat_image, inputs=[image_input], outputs=[chat_interface.chatbot] ) gr.Markdown(""" --- ## 📊 Model Details | Property | Value | |----------|-------| | **Base Model** | LiquidAI/LFM2-VL-1.6B | | **Architecture** | LFM2-VL (1.6B parameters) | | **Fine-tuning** | LoRA (Low-Rank Adaptation) | | **Training Data** | Android control episodes with screenshots and actions | ## 🎯 Use Cases - **Mobile App Testing**: Automated UI testing for Android applications - **Accessibility Applications**: Voice-controlled device navigation - **Remote Support**: Remote device troubleshooting - **Development Workflows**: UI/UX testing automation --- **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control) """) return demo # Create and launch the demo if __name__ == "__main__": demo = create_demo() demo.launch( server_name="0.0.0.0", server_port=7860, share=False, debug=True, show_error=True, ssr_mode=False )