import gradio as gr
import requests
import json
import base64
import os
from typing import List, Tuple, Any
import mimetypes
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

class OmniAPIClient:
    """Client for interacting with the Omni API"""

    def __init__(self, base_url: str = "https://api.modelharbor.com"):
        self.base_url = base_url.rstrip('/')
        self.chat_endpoint = f"{self.base_url}/v1/chat/completions"
        self.models_endpoint = f"{self.base_url}/v1/models"

    def encode_file_to_base64(self, file_path: str) -> str:
        """Encode file to base64 string"""
        with open(file_path, "rb") as file:
            return base64.b64encode(file.read()).decode('utf-8')

    def get_mime_type(self, file_path: str) -> str:
        """Get MIME type of file, falling back to a generic binary type"""
        mime_type, _ = mimetypes.guess_type(file_path)
        return mime_type or "application/octet-stream"

    def create_file_content(self, file_path: str) -> dict:
        """Create a content part for a file, matching the API's expected format"""
        file_name = os.path.basename(file_path)
        mime_type = self.get_mime_type(file_path)
        file_data_b64 = self.encode_file_to_base64(file_path)
        # Images use the image_url content type; everything else is sent as a generic file
        if mime_type.startswith('image/'):
            return {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{file_data_b64}"
                }
            }
        else:
            return {
                "type": "file",
                "file": {
                    "filename": file_name,
                    "file_data": f"data:{mime_type};base64,{file_data_b64}"
                }
            }
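
    # Example (illustrative): for "receipt.jpg" the method above produces a part like
    #   {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<...>"}}
    # while "report.pdf" yields
    #   {"type": "file", "file": {"filename": "report.pdf", "file_data": "data:application/pdf;base64,<...>"}}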

    def build_message_content(self, text: str, files: List[str]) -> List[dict]:
        """Build message content with text followed by file parts"""
        content_parts = []
        # Add text content first
        if text.strip():
            content_parts.append({
                "type": "text",
                "text": text
            })
        # Add files in upload order, skipping paths that no longer exist
        for file_path in files:
            if file_path and os.path.exists(file_path):
                content_parts.append(self.create_file_content(file_path))
        return content_parts
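
    # Example (illustrative): build_message_content("Summarize this", ["report.pdf"])
    # returns an OpenAI-style multimodal content list:
    #   [{"type": "text", "text": "Summarize this"},
    #    {"type": "file", "file": {"filename": "report.pdf", "file_data": "data:application/pdf;base64,<...>"}}]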

    def get_available_models(self, api_key: str = "") -> Tuple[bool, List[str]]:
        """Return a fixed set of available models"""
        fixed_models = [
            "typhoon-ocr-preview",
            "qwen/qwen3-vl-235b-a22b-instruct",
            "openai/gpt-5",
            "meta-llama/llama-4-maverick",
            "gemini/gemini-2.5-pro",
            "gemini/gemini-2.5-flash"
        ]
        return True, fixed_models
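
    # Note: self.models_endpoint is defined in __init__ but not queried here;
    # the model list is hard-coded rather than fetched from the live /v1/models endpoint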

    def send_chat_completion(
        self,
        text: str,
        files: List[str],
        api_key: str = "",
        model: str = "qwen/qwen3-vl-235b-a22b-instruct",
        max_tokens: int = 16384,
        stream: bool = False
    ) -> Tuple[bool, Any]:
        """Send a chat completion request to the API"""
        try:
            # Build message content; bail out early if there is nothing to send
            content_parts = self.build_message_content(text, files)
            if not content_parts:
                return False, {"error": "No text or valid files provided"}

            # Build request payload
            payload = {
                "model": model,
                "messages": [
                    {
                        "role": "user",
                        "content": content_parts
                    }
                ],
                "max_tokens": max_tokens,
                "stream": stream
            }

            # Build headers
            headers = {"Content-Type": "application/json"}
            if api_key:
                headers["Authorization"] = f"Bearer {api_key}"

            # Send request (long timeout to accommodate large files and slow models)
            response = requests.post(
                self.chat_endpoint,
                json=payload,
                headers=headers,
                timeout=600
            )

            # Check response
            if response.status_code == 200:
                try:
                    return True, response.json()
                except json.JSONDecodeError:
                    return False, {"error": "Invalid JSON response", "raw_response": response.text}
            try:
                error_data = response.json()
                return False, {"error": f"API Error ({response.status_code})", "details": error_data}
            except json.JSONDecodeError:
                return False, {"error": f"HTTP {response.status_code}", "raw_response": response.text}
        except requests.exceptions.Timeout:
            return False, {"error": "Request timeout"}
        except requests.exceptions.ConnectionError:
            return False, {"error": "Connection error"}
        except Exception as e:
            return False, {"error": f"Unexpected error: {str(e)}"}
def create_ui():
    """Create the Gradio UI"""
    # Available API endpoints and their corresponding models
    API_ENDPOINTS = {
        "https://api.modelharbor.com": [
            "typhoon-ocr-preview",
            "qwen/qwen3-vl-235b-a22b-instruct",
            "openai/gpt-5",
            "meta-llama/llama-4-maverick",
            "gemini/gemini-2.5-pro",
            "gemini/gemini-2.5-flash"
        ],
        "https://api-omni.modelharbor.com": [
            "qwen/qwen3-235b-a22b-instruct-2507-omni"
        ]
    }

    def fetch_models(api_endpoint):
        """Return the model list for the selected API endpoint"""
        models = API_ENDPOINTS.get(api_endpoint, [])
        default_model = models[0] if models else ""
        return gr.Dropdown(choices=models, value=default_model)

    def send_request(api_endpoint, model, max_tokens, text, files):
        """Handle request submission"""
        try:
            # Validate inputs
            if not api_endpoint:
                return "❌ API endpoint is required", ""
            if not text.strip() and not files:
                return "❌ Please provide either text or upload files", ""

            # Always use the API key from the environment variable / secrets
            api_key_to_use = os.getenv("API_KEY", "")

            # Create client
            client = OmniAPIClient(api_endpoint)

            # Normalize the file input: Gradio may pass None, a single file, or a
            # list, and each entry may be a tempfile wrapper (with .name) or a
            # plain path string depending on the Gradio version
            valid_files = []
            if files is not None:
                file_list = files if isinstance(files, list) else [files]
                for f in file_list:
                    if f is None:
                        continue
                    path = f.name if hasattr(f, "name") else f
                    if isinstance(path, str):
                        valid_files.append(path)

            # Send request
            success, response = client.send_chat_completion(
                text=text,
                files=valid_files,
                api_key=api_key_to_use,
                model=model,
                max_tokens=max_tokens
            )

            if success:
                formatted_response = json.dumps(response, indent=2)
                status = "✅ Request successful"
                # Extract the assistant's reply if available
                if "choices" in response and len(response["choices"]) > 0:
                    choice = response["choices"][0]
                    if "message" in choice and "content" in choice["message"]:
                        assistant_reply = choice["message"]["content"]
                        # Typhoon OCR models may wrap output as {"natural_text": "..."}
                        if "typhoon" in model.lower():
                            try:
                                json_content = json.loads(assistant_reply)
                                if "natural_text" in json_content:
                                    assistant_reply = json_content["natural_text"]
                            except (json.JSONDecodeError, KeyError, TypeError):
                                # Fall back to the raw content if it is not wrapped JSON
                                pass
                        status = f"✅ Request successful\n\n**Assistant Reply:**\n{assistant_reply}"
                return status, formatted_response
            else:
                # Format error response
                error_response = json.dumps(response, indent=2)
                return "❌ Request failed", error_response
        except Exception as e:
            return f"❌ Error: {str(e)}", ""

    def clear_form():
        """Clear all form inputs"""
        return "", "", "", None

    # Custom CSS for better layout
    css = """
    .gradio-container {
        max-width: 1200px;
    }
    .config-panel {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 15px;
        margin-bottom: 20px;
    }
    .input-panel {
        border-right: 1px solid #e0e0e0;
        padding-right: 20px;
    }
    .output-panel {
        padding-left: 20px;
    }
    """

    with gr.Blocks(css=css, title="Omni API Chat Interface") as interface:
        gr.Markdown("# 🤖 Omni API Chat Interface")
        gr.Markdown("Interact with the Omni API using text, PDFs, images, and audio files")

        # Configuration section
        with gr.Group(elem_classes=["config-panel"]):
            gr.Markdown("## ⚙️ Configuration")
            with gr.Row():
                api_endpoint = gr.Dropdown(
                    label="API Endpoint",
                    choices=list(API_ENDPOINTS.keys()),
                    value="https://api.modelharbor.com"
                )
            with gr.Row():
                with gr.Column(scale=3):
                    model = gr.Dropdown(
                        label="Model",
                        choices=API_ENDPOINTS["https://api.modelharbor.com"],
                        # Default must be one of the choices for this endpoint
                        value="qwen/qwen3-vl-235b-a22b-instruct",
                        interactive=True
                    )
                with gr.Column(scale=2):
                    max_tokens = gr.Number(
                        label="Max Tokens",
                        value=16384,
                        minimum=1,
                        maximum=32000
                    )

        # Main interface
        with gr.Row():
            # Input panel (left side)
            with gr.Column(scale=1, elem_classes=["input-panel"]):
                gr.Markdown("## 📝 Input")
                text_input = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    lines=5
                )
                file_upload = gr.File(
                    label="Upload Files",
                    file_count="multiple",
                    file_types=[
                        ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp",
                        ".mp3", ".wav", ".m4a", ".flac", ".ogg"
                    ]
                )
                with gr.Row():
                    send_btn = gr.Button("🚀 Send Request", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

            # Output panel (right side)
            with gr.Column(scale=1, elem_classes=["output-panel"]):
                gr.Markdown("## 📤 Response")
                status_output = gr.Textbox(
                    label="Status",
                    placeholder="Response status will appear here...",
                    lines=8,
                    max_lines=15,
                    interactive=False
                )
                response_output = gr.Code(
                    label="Raw Response",
                    language="json",
                    interactive=False
                )

        # Example section
        with gr.Accordion("📚 Usage Examples", open=False):
            gr.Markdown("""
            ### Example Requests:

            **Text Only:**
            - Message: "Hello, how are you?"
            - Files: None

            **PDF Analysis:**
            - Message: "Please summarize this document"
            - Files: document.pdf

            **Image OCR:**
            - Message: "Extract text from this image"
            - Files: receipt.jpg

            **Audio Transcription:**
            - Message: "Transcribe this audio file"
            - Files: meeting.mp3

            **Multi-modal:**
            - Message: "Analyze these files and provide insights"
            - Files: report.pdf, chart.png, recording.wav

            ### Supported File Types:
            - **PDFs**: .pdf
            - **Images**: .jpg, .jpeg, .png, .gif, .bmp, .webp
            - **Audio**: .mp3, .wav, .m4a, .flac, .ogg
            """)

        # Event handlers
        send_btn.click(
            fn=send_request,
            inputs=[api_endpoint, model, max_tokens, text_input, file_upload],
            outputs=[status_output, response_output]
        )
        clear_btn.click(
            fn=clear_form,
            outputs=[text_input, status_output, response_output, file_upload]
        )
        # Allow Enter key to submit (when the text input is focused)
        text_input.submit(
            fn=send_request,
            inputs=[api_endpoint, model, max_tokens, text_input, file_upload],
            outputs=[status_output, response_output]
        )
        # Update the model list when the API endpoint changes
        api_endpoint.change(
            fn=fetch_models,
            inputs=[api_endpoint],
            outputs=[model]
        )

    return interface


if __name__ == "__main__":
    # Create and launch the interface
    demo = create_ui()
    # Launch with default settings; uncomment any of these to override
    demo.launch(
        # server_name="127.0.0.1",   # Use localhost instead of 0.0.0.0
        # server_port=7892,          # Use a different port to avoid conflicts
        # share=False,               # Set to True to create a public link
        # debug=True,                # Enable debug mode for verbose errors
        # show_error=True,           # Show detailed error messages
        # inbrowser=False,           # Auto-open in browser
        # prevent_thread_lock=False  # Ensure proper threading
    )