import gradio as gr
import torch
from PIL import Image
import json
import os
import logging
from transformers import AutoProcessor, AutoModelForImageTextToText
import spaces

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model configuration
MODEL_ID = "Tonic/l-operator"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Get Hugging Face token from environment variable (Spaces secrets)
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    logger.warning("HF_TOKEN not found in environment variables. Model access may be restricted.")
    logger.warning("Please set HF_TOKEN in your environment variables or Spaces secrets.")
class LOperatorDemo:
    def __init__(self):
        self.model = None
        self.processor = None
        self.is_loaded = False

    def load_model(self):
        """Load the L-Operator model and processor, logging the load time"""
        try:
            import time
            start_time = time.time()
            logger.info(f"Loading model {MODEL_ID} on device {DEVICE}")

            # Check if token is available
            if not HF_TOKEN:
                return "❌ HF_TOKEN not found. Please set HF_TOKEN in Spaces secrets."

            # Load model with progress logging
            logger.info("Downloading and loading model weights...")
            self.model = AutoModelForImageTextToText.from_pretrained(
                MODEL_ID,
                device_map="auto",
                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
                trust_remote_code=True,
                token=HF_TOKEN
            )

            # Load processor
            logger.info("Loading processor...")
            self.processor = AutoProcessor.from_pretrained(
                MODEL_ID,
                trust_remote_code=True,
                token=HF_TOKEN
            )

            if DEVICE == "cpu":
                self.model = self.model.to(DEVICE)

            self.is_loaded = True
            load_time = time.time() - start_time
            logger.info(f"Model loaded successfully in {load_time:.1f} seconds")
            return f"✅ Model loaded successfully in {load_time:.1f} seconds"
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return f"❌ Error loading model: {str(e)} - this may be a custom model requiring special handling"
    # Request a ZeroGPU slot for up to 2 minutes for action generation
    @spaces.GPU(duration=120)
    def generate_action(self, image: Image.Image, goal: str, instruction: str) -> str:
        """Generate action based on image and text inputs"""
        if not self.is_loaded:
            return "❌ Model not loaded. Please load the model first."
        try:
            # Convert image to RGB if needed
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Build conversation
            conversation = [
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You are a helpful multimodal assistant by Liquid AI."}
                    ]
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": f"Goal: {goal}\nStep: {instruction}\nRespond with a JSON action containing relevant keys (e.g., action_type, x, y, text, app_name, direction)."}
                    ]
                }
            ]
| logger.info("Processing conversation with processor...") | |
| # Process inputs with better error handling | |
| try: | |
| inputs = self.processor.apply_chat_template( | |
| conversation, | |
| add_generation_prompt=True, | |
| return_tensors="pt" | |
| ) | |
| logger.info(f"Processor output type: {type(inputs)}") | |
| # Ensure inputs is a tensor and move to correct device | |
| if not isinstance(inputs, torch.Tensor): | |
| logger.warning("apply_chat_template did not return a tensor, attempting to convert...") | |
| if isinstance(inputs, (list, tuple)): | |
| inputs = torch.tensor(inputs) | |
| else: | |
| # If it's a string or other type, we need to handle it differently | |
| logger.error(f"Unexpected input type: {type(inputs)}, value: {inputs}") | |
| return "β Error: Processor returned unexpected format" | |
| inputs = inputs.to(self.model.device) | |
| logger.info(f"Inputs shape: {inputs.shape}, device: {inputs.device}") | |
| except Exception as e: | |
| logger.error(f"Error in processor: {str(e)}") | |
| return f"β Error in processor: {str(e)}" | |
            # Generate response
            logger.info("Generating response...")
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs,
                    max_new_tokens=128,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )

            logger.info("Decoding response...")
            # Slice off the prompt tokens so only newly generated text is decoded
            response = self.processor.tokenizer.decode(
                outputs[0][inputs.shape[1]:],
                skip_special_tokens=True
            )

            # Try to parse as JSON for better formatting
            try:
                parsed_response = json.loads(response)
                return json.dumps(parsed_response, indent=2)
            except json.JSONDecodeError:
                return response
        except Exception as e:
            logger.error(f"Error generating action: {str(e)}")
            return f"❌ Error generating action: {str(e)}"
# Initialize demo
demo_instance = LOperatorDemo()

def process_input(image, goal):
    """Process the input and generate action"""
    if image is None:
        return "❌ Please upload an Android screenshot image."
    if not goal.strip():
        return "❌ Please provide a goal."
    if not demo_instance.is_loaded:
        return "❌ Model not loaded. Please wait for it to load automatically."
    try:
        # Handle different image formats
        pil_image = None
        if hasattr(image, 'mode'):  # PIL Image object
            pil_image = image
        elif isinstance(image, str) and os.path.exists(image):
            # Handle file path (from examples)
            pil_image = Image.open(image)
        elif hasattr(image, 'name') and os.path.exists(image.name):
            # Handle Gradio file object
            pil_image = Image.open(image.name)
        else:
            return "❌ Invalid image format. Please upload a valid image."
        if pil_image is None:
            return "❌ Failed to process image. Please try again."

        # Convert image to RGB if needed
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")

        # Generate action, using the goal as both goal and step instruction
        response = demo_instance.generate_action(pil_image, goal, goal)
        return response
    except Exception as e:
        logger.error(f"Error processing input: {str(e)}")
        return f"❌ Error: {str(e)}"
def load_example_episodes():
    """Load example episodes, reading screenshots directly with PIL"""
    examples = []
    try:
        episode_dirs = ["episode_13", "episode_53", "episode_73"]
        for episode_dir in episode_dirs:
            try:
                metadata_path = f"extracted_episodes_duckdb/{episode_dir}/metadata.json"
                image_path = f"extracted_episodes_duckdb/{episode_dir}/screenshots/screenshot_1.png"

                # Check that both files exist
                if os.path.exists(metadata_path) and os.path.exists(image_path):
                    logger.info(f"Loading example from {episode_dir}")
                    with open(metadata_path, "r") as f:
                        metadata = json.load(f)

                    # Load image directly with PIL
                    pil_image = Image.open(image_path)
                    episode_num = episode_dir.split('_')[1]
                    goal_text = metadata.get('goal', f'Episode {episode_num} example')
                    logger.info(f"Episode {episode_num} goal: {goal_text}")

                    examples.append([
                        pil_image,  # PIL Image object, used directly by gr.Image
                        goal_text   # goal text from the episode metadata
                    ])
                    logger.info(f"Successfully loaded example for Episode {episode_num}")
            except Exception as e:
                logger.warning(f"Could not load example for {episode_dir}: {str(e)}")
                continue
    except Exception as e:
        logger.error(f"Error loading examples: {str(e)}")
        examples = []
    logger.info(f"Loaded {len(examples)} examples using PIL")
    return examples
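# Expected on-disk layout for the bundled examples (inferred from the paths above):
#   extracted_episodes_duckdb/
#       episode_13/
#           metadata.json
#           screenshots/screenshot_1.png
#       episode_53/ ...
#       episode_73/ ...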
# Create Gradio interface
def create_demo():
    """Create the Gradio demo interface using Blocks"""
    with gr.Blocks(
        title="L-Operator: Android Device Control Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .output-container {
            min-height: 200px;
        }
        """
    ) as demo:
        gr.Markdown("""
        # 🤖 L-Operator: Android Device Control Demo

        **Lightweight Multimodal Android Device Control Agent**

        This demo showcases the L-Operator model, a fine-tuned multimodal AI agent based on LiquidAI's LFM2-VL-1.6B model,
        optimized for Android device control through visual understanding and action generation.

        ## 🚀 How to Use

        1. **Upload Screenshot**: Upload an Android device screenshot
        2. **Describe Goal**: Enter what you want to accomplish
        3. **Get Actions**: The model will generate JSON actions for Android device control

        ## 📋 Expected Output Format

        The model generates JSON actions in the following format:

        ```json
        {
            "action_type": "tap",
            "x": 540,
            "y": 1200,
            "text": "Settings",
            "app_name": "com.android.settings",
            "confidence": 0.92
        }
        ```

        ---
        """)
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📱 Upload Screenshot")
                image_input = gr.Image(
                    label="Android Screenshot",
                    type="pil",
                    height=400
                )

                gr.Markdown("### 🎯 Goal")
                goal_input = gr.Textbox(
                    label="What would you like to accomplish?",
                    placeholder="e.g., Open the Settings app and navigate to Display settings",
                    lines=3
                )

                # Process button
                process_btn = gr.Button("🚀 Generate Action", variant="primary", size="lg")

            with gr.Column(scale=1):
                gr.Markdown("### 📝 Generated Action")
                output_text = gr.Textbox(
                    label="JSON Action Output",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    elem_classes=["output-container"]
                )

        # Connect the process button
        process_btn.click(
            fn=process_input,
            inputs=[image_input, goal_input],
            outputs=output_text
        )
        # Load examples
        gr.Markdown("### 📚 Example Episodes")
        try:
            examples = load_example_episodes()
            if examples:
                with gr.Row():
                    for i, (image, goal) in enumerate(examples):
                        with gr.Column(scale=1):
                            gr.Markdown(f"**Episode {i+1}**")
                            example_image = gr.Image(
                                value=image,
                                label=f"Example {i+1}",
                                height=200,
                                interactive=False
                            )
                            example_goal = gr.Textbox(
                                value=goal,
                                label="Goal",
                                lines=2,
                                interactive=False
                            )
                            # Button that copies this example into the main inputs
                            load_example_btn = gr.Button(f"Load Example {i+1}", size="sm")
                            load_example_btn.click(
                                fn=lambda img, g: (img, g),
                                inputs=[example_image, example_goal],
                                outputs=[image_input, goal_input]
                            )
        except Exception as e:
            logger.warning(f"Failed to load examples: {str(e)}")
            gr.Markdown("⚠️ Failed to load examples. Please upload your own screenshot.")
        # Load model automatically on startup
        def load_model_on_startup():
            """Load model automatically without user feedback"""
            if not demo_instance.is_loaded:
                logger.info("Loading L-Operator model automatically...")
                try:
                    demo_instance.load_model()
                    logger.info("Model loaded successfully in background")
                except Exception as e:
                    logger.error(f"Failed to load model: {str(e)}")

        # Load model automatically on page load
        demo.load(fn=load_model_on_startup)
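        # Note: Blocks.load fires on every browser page load, so the is_loaded
        # guard above prevents reloading the model for returning visitors.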
        gr.Markdown("""
        ---
        ## 📊 Model Details

        | Property | Value |
        |----------|-------|
        | **Base Model** | LiquidAI/LFM2-VL-1.6B |
        | **Architecture** | LFM2-VL (1.6B parameters) |
        | **Fine-tuning** | LoRA (Low-Rank Adaptation) |
        | **Training Data** | Android control episodes with screenshots and actions |

        ## 🎯 Use Cases

        - **Mobile App Testing**: Automated UI testing for Android applications
        - **Accessibility Applications**: Voice-controlled device navigation
        - **Remote Support**: Remote device troubleshooting
        - **Development Workflows**: UI/UX testing automation

        ---
        **Made with ❤️ by Tonic** | [Model on Hugging Face](https://huggingface.co/Tonic/l-android-control)
        """)

    return demo
# Create and launch the demo with optimized settings
if __name__ == "__main__":
    try:
        logger.info("Creating Gradio demo interface...")
        demo = create_demo()
        logger.info("Launching Gradio server...")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,    # Disable debug to reduce startup time
            show_error=True,
            ssr_mode=False,
            max_threads=2,  # Limit threads to prevent resource exhaustion
            quiet=True      # Reduce startup logging noise
        )
    except Exception as e:
        logger.error(f"Failed to launch Gradio app: {str(e)}")
        raise
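# Local usage sketch (assumption: running outside Hugging Face Spaces; the
# `spaces` package must be installed, and its @spaces.GPU decorator is
# expected to act as a pass-through when no ZeroGPU hardware is attached):
#   pip install gradio spaces torch transformers pillow
#   HF_TOKEN=<your token> python app.py
# Then open http://localhost:7860 in a browser.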