# ai_ocr / app.py
import gradio as gr
import requests
import json
import base64
import os
from typing import List, Tuple, Any
import mimetypes
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
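# The UI never prompts for a key; send_request() below reads it from the
# API_KEY environment variable. A minimal .env file therefore needs just one
# line (placeholder value shown):
#
#     API_KEY=your-key-here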
class OmniAPIClient:
"""Client for interacting with the Omni API"""
def __init__(self, base_url: str = "https://api.modelharbor.com"):
self.base_url = base_url.rstrip('/')
self.chat_endpoint = f"{self.base_url}/v1/chat/completions"
self.models_endpoint = f"{self.base_url}/v1/models"
def encode_file_to_base64(self, file_path: str) -> str:
"""Encode file to base64 string"""
with open(file_path, "rb") as file:
return base64.b64encode(file.read()).decode('utf-8')
def get_mime_type(self, file_path: str) -> str:
"""Get MIME type of file"""
mime_type, _ = mimetypes.guess_type(file_path)
return mime_type or "application/octet-stream"
    def create_file_content(self, file_path: str) -> dict:
"""Create file content object based on API format"""
file_name = os.path.basename(file_path)
mime_type = self.get_mime_type(file_path)
# Check if the file is an image
if mime_type and mime_type.startswith('image/'):
# Handle images with the new format
file_data_b64 = self.encode_file_to_base64(file_path)
return {
"type": "image_url",
"image_url": {
"url": f"data:{mime_type};base64,{file_data_b64}"
}
}
else:
# Handle other files with existing logic
file_data_b64 = self.encode_file_to_base64(file_path)
return {
"type": "file",
"file": {
"filename": file_name,
"file_data": f"data:{mime_type};base64,{file_data_b64}"
}
}
def build_message_content(self, text: str, files: List[str]) -> List[dict]:
"""Build message content with text and files"""
content_parts = []
# Add text content first
if text.strip():
content_parts.append({
"type": "text",
"text": text
})
# Add files in order
for file_path in files:
if file_path and os.path.exists(file_path):
                file_content = self.create_file_content(file_path)
content_parts.append(file_content)
return content_parts
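    # For illustration, a call such as
    # build_message_content("Summarize this", ["scan.png", "report.pdf"])
    # (hypothetical inputs) yields roughly:
    #
    #     [
    #         {"type": "text", "text": "Summarize this"},
    #         {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #         {"type": "file", "file": {"filename": "report.pdf",
    #                                   "file_data": "data:application/pdf;base64,..."}},
    #     ]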
def get_available_models(self, api_key: str = "") -> Tuple[bool, List[str]]:
"""Return fixed set of available models"""
# Fixed set of models as requested
fixed_models = [
"typhoon-ocr-preview",
"qwen/qwen3-vl-235b-a22b-instruct",
"openai/gpt-5",
"meta-llama/llama-4-maverick",
"gemini/gemini-2.5-pro",
"gemini/gemini-2.5-flash"
]
return True, fixed_models
    def send_chat_completion(
        self,
        text: str,
        files: List[str],
        api_key: str = "",
        model: str = "qwen/qwen3-vl-235b-a22b-instruct",
        max_tokens: int = 16384,
        stream: bool = False,
    ) -> Tuple[bool, Any]:
"""Send chat completion request to the API"""
try:
# Build message content
content_parts = self.build_message_content(text, files)
# If no content parts, return error
if not content_parts:
return False, {"error": "No text or valid files provided"}
# Build request payload
payload = {
"model": model,
"messages": [
{
"role": "user",
"content": content_parts
}
],
"max_tokens": max_tokens,
"stream": stream
}
# Build headers
headers = {
"Content-Type": "application/json"
}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
# Send request
response = requests.post(
self.chat_endpoint,
json=payload,
headers=headers,
timeout=600
)
# Check response
if response.status_code == 200:
try:
response_data = response.json()
return True, response_data
except json.JSONDecodeError:
return False, {"error": "Invalid JSON response", "raw_response": response.text}
else:
try:
error_data = response.json()
return False, {"error": f"API Error ({response.status_code})", "details": error_data}
except json.JSONDecodeError:
return False, {"error": f"HTTP {response.status_code}", "raw_response": response.text}
except requests.exceptions.Timeout:
return False, {"error": "Request timeout"}
except requests.exceptions.ConnectionError:
return False, {"error": "Connection error"}
except Exception as e:
return False, {"error": f"Unexpected error: {str(e)}"}
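# Minimal usage sketch for the client outside the UI (illustrative; assumes a
# valid key in API_KEY and a local file named receipt.jpg, both hypothetical):
#
#     client = OmniAPIClient()
#     ok, resp = client.send_chat_completion(
#         text="Extract the text from this image",
#         files=["receipt.jpg"],
#         api_key=os.getenv("API_KEY", ""),
#     )
#     print(resp["choices"][0]["message"]["content"] if ok else resp["error"])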
def create_ui():
"""Create the Gradio UI"""
# Define available API endpoints and their corresponding models
API_ENDPOINTS = {
"https://api.modelharbor.com": [
"typhoon-ocr-preview",
"qwen/qwen3-vl-235b-a22b-instruct",
"openai/gpt-5",
"meta-llama/llama-4-maverick",
"gemini/gemini-2.5-pro",
"gemini/gemini-2.5-flash"
],
"https://api-omni.modelharbor.com": [
"qwen/qwen3-235b-a22b-instruct-2507-omni"
]
}
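    # Any other OpenAI-compatible endpoint can be added here and will show up
    # in the endpoint dropdown, e.g. a hypothetical entry:
    #
    #     "https://api.example.com": ["vendor/model-name"]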
def fetch_models(api_endpoint):
"""Return models based on selected API endpoint"""
models = API_ENDPOINTS.get(api_endpoint, [])
default_model = models[0] if models else ""
return gr.Dropdown(choices=models, value=default_model)
def send_request(api_endpoint, model, max_tokens, text, files):
"""Handle request submission"""
try:
# Validate inputs
if not api_endpoint:
return "❌ API endpoint is required", ""
if not text.strip() and not files:
return "❌ Please provide either text or upload files", ""
# Always use API key from environment variable/secrets
api_key_to_use = os.getenv("API_KEY", "")
# Create client
client = OmniAPIClient(api_endpoint)
            # Filter out None/empty files. Depending on the Gradio version,
            # gr.File may return plain path strings or tempfile-like objects
            # with a .name attribute, so handle both forms.
            valid_files = []
            if files is not None:
                file_list = files if isinstance(files, list) else [files]
                for f in file_list:
                    if isinstance(f, str):
                        valid_files.append(f)
                    elif f is not None and hasattr(f, 'name'):
                        valid_files.append(f.name)
# Send request
success, response = client.send_chat_completion(
text=text,
files=valid_files,
api_key=api_key_to_use,
model=model,
max_tokens=max_tokens
)
if success:
# Format successful response
formatted_response = json.dumps(response, indent=2)
# Extract the assistant's reply if available
if "choices" in response and len(response["choices"]) > 0:
choice = response["choices"][0]
if "message" in choice and "content" in choice["message"]:
                        # Typhoon models may wrap their output as JSON of the
                        # form {"natural_text": "..."}; unwrap it when present
                        if "typhoon" in model.lower():
                            try:
                                json_content = json.loads(choice["message"]["content"])
                                if "natural_text" in json_content:
                                    assistant_reply = json_content["natural_text"]
                                else:
                                    assistant_reply = choice["message"]["content"]
                            except (json.JSONDecodeError, KeyError, TypeError):
                                # Fall back to the raw content when it is not
                                # valid JSON or lacks the expected structure
                                assistant_reply = choice["message"]["content"]
else:
assistant_reply = choice["message"]["content"]
status = f"βœ… Request successful\n\n**Assistant Reply:**\n{assistant_reply}"
else:
status = "βœ… Request successful"
else:
status = "βœ… Request successful"
return status, formatted_response
else:
# Format error response
error_response = json.dumps(response, indent=2)
return f"❌ Request failed", error_response
except Exception as e:
return f"❌ Error: {str(e)}", ""
def clear_form():
"""Clear all form inputs"""
return "", "", "", None
# Custom CSS for better layout
css = """
.gradio-container {
max-width: 1200px;
}
.config-panel {
background-color: #f8f9fa;
border-radius: 8px;
padding: 15px;
margin-bottom: 20px;
}
.input-panel {
border-right: 1px solid #e0e0e0;
padding-right: 20px;
}
.output-panel {
padding-left: 20px;
}
"""
with gr.Blocks(css=css, title="Omni API Chat Interface") as interface:
gr.Markdown("# πŸ€– Omni API Chat Interface")
gr.Markdown("Interact with the Omni API using text, PDFs, images, and audio files")
# Configuration section
with gr.Group(elem_classes=["config-panel"]):
gr.Markdown("## βš™οΈ Configuration")
with gr.Row():
api_endpoint = gr.Dropdown(
label="API Endpoint",
choices=list(API_ENDPOINTS.keys()),
value="https://api.modelharbor.com"
)
with gr.Row():
with gr.Column(scale=3):
model = gr.Dropdown(
label="Model",
choices=API_ENDPOINTS["https://api.modelharbor.com"],
value="qwen/qwen3-235b-a22b-instruct-2507",
interactive=True
)
with gr.Column(scale=2):
max_tokens = gr.Number(
label="Max Tokens",
value=16384,
minimum=1,
maximum=32000
)
# Main interface
with gr.Row():
# Input panel (left side)
with gr.Column(scale=1, elem_classes=["input-panel"]):
gr.Markdown("## πŸ“ Input")
text_input = gr.Textbox(
label="Your Message",
placeholder="Type your message here...",
lines=5
)
file_upload = gr.File(
label="Upload Files",
file_count="multiple",
file_types=[
".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp",
".mp3", ".wav", ".m4a", ".flac", ".ogg"
]
)
with gr.Row():
send_btn = gr.Button("πŸš€ Send Request", variant="primary", size="lg")
clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
# Output panel (right side)
with gr.Column(scale=1, elem_classes=["output-panel"]):
gr.Markdown("## πŸ“€ Response")
status_output = gr.Textbox(
label="Status",
placeholder="Response status will appear here...",
lines=8,
max_lines=15,
interactive=False
)
response_output = gr.Code(
label="Raw Response",
language="json",
interactive=False
)
# Example section
with gr.Accordion("πŸ“š Usage Examples", open=False):
gr.Markdown("""
### Example Requests:
**Text Only:**
- Message: "Hello, how are you?"
- Files: None
**PDF Analysis:**
- Message: "Please summarize this document"
- Files: document.pdf
**Image OCR:**
- Message: "Extract text from this image"
- Files: receipt.jpg
**Audio Transcription:**
- Message: "Transcribe this audio file"
- Files: meeting.mp3
**Multi-modal:**
- Message: "Analyze these files and provide insights"
- Files: report.pdf, chart.png, recording.wav
### Supported File Types:
- **PDFs**: .pdf
- **Images**: .jpg, .jpeg, .png, .gif, .bmp, .webp
- **Audio**: .mp3, .wav, .m4a, .flac, .ogg
""")
# Event handlers
send_btn.click(
fn=send_request,
inputs=[api_endpoint, model, max_tokens, text_input, file_upload],
outputs=[status_output, response_output]
)
clear_btn.click(
fn=clear_form,
outputs=[text_input, status_output, response_output, file_upload]
)
# Allow Enter key to submit (when text input is focused)
text_input.submit(
fn=send_request,
inputs=[api_endpoint, model, max_tokens, text_input, file_upload],
outputs=[status_output, response_output]
)
# Update model list when API endpoint changes
api_endpoint.change(
fn=fetch_models,
inputs=[api_endpoint],
outputs=[model]
)
return interface
if __name__ == "__main__":
# Create and launch the interface
demo = create_ui()
# Launch with custom settings
demo.launch(
        # server_name="127.0.0.1",   # Bind to localhost instead of 0.0.0.0
        # server_port=7892,          # Use a different port to avoid conflicts
        # share=False,               # Set to True to create a public link
        # debug=True,                # Enable debug mode for verbose errors
        # show_error=True,           # Surface detailed error messages in the UI
        # inbrowser=False,           # Set to True to auto-open a browser tab
        # prevent_thread_lock=False  # Keep the main thread blocked while serving
)