# ai_ocr / app.py
import gradio as gr
import requests
import json
import base64
import os
from typing import List, Tuple, Any
import mimetypes
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
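# The UI never prompts for a key; send_request() below reads it from the
# API_KEY environment variable. A minimal .env file therefore needs just one
# line (placeholder value shown):
#
#     API_KEY=your-key-here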
class OmniAPIClient:
"""Client for interacting with the Omni API"""
def __init__(self, base_url: str = "https://api.modelharbor.com"):
self.base_url = base_url.rstrip('/')
self.chat_endpoint = f"{self.base_url}/v1/chat/completions"
self.models_endpoint = f"{self.base_url}/v1/models"
def encode_file_to_base64(self, file_path: str) -> str:
"""Encode file to base64 string"""
with open(file_path, "rb") as file:
return base64.b64encode(file.read()).decode('utf-8')
def get_mime_type(self, file_path: str) -> str:
"""Get MIME type of file"""
mime_type, _ = mimetypes.guess_type(file_path)
return mime_type or "application/octet-stream"
    def create_file_content(self, file_path: str) -> dict:
"""Create file content object based on API format"""
file_name = os.path.basename(file_path)
mime_type = self.get_mime_type(file_path)
# Check if the file is an image
if mime_type and mime_type.startswith('image/'):
# Handle images with the new format
file_data_b64 = self.encode_file_to_base64(file_path)
return {
"type": "image_url",
"image_url": {
"url": f"data:{mime_type};base64,{file_data_b64}"
}
}
else:
# Handle other files with existing logic
file_data_b64 = self.encode_file_to_base64(file_path)
return {
"type": "file",
"file": {
"filename": file_name,
"file_data": f"data:{mime_type};base64,{file_data_b64}"
}
}
def build_message_content(self, text: str, files: List[str]) -> List[dict]:
"""Build message content with text and files"""
content_parts = []
# Add text content first
if text.strip():
content_parts.append({
"type": "text",
"text": text
})
# Add files in order
for file_path in files:
if file_path and os.path.exists(file_path):
                file_content = self.create_file_content(file_path)
content_parts.append(file_content)
return content_parts
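    # For illustration, a call such as
    # build_message_content("Summarize this", ["scan.png", "report.pdf"])
    # (hypothetical inputs) yields roughly:
    #
    #     [
    #         {"type": "text", "text": "Summarize this"},
    #         {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    #         {"type": "file", "file": {"filename": "report.pdf",
    #                                   "file_data": "data:application/pdf;base64,..."}},
    #     ]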
def get_available_models(self, api_key: str = "") -> Tuple[bool, List[str]]:
"""Return fixed set of available models"""
# Fixed set of models as requested
fixed_models = [
"typhoon-ocr-preview",
"qwen/qwen3-vl-235b-a22b-instruct",
"openai/gpt-5",
"meta-llama/llama-4-maverick",
"gemini/gemini-2.5-pro",
"gemini/gemini-2.5-flash"
]
return True, fixed_models
    def send_chat_completion(
        self,
        text: str,
        files: List[str],
        api_key: str = "",
        model: str = "qwen/qwen3-vl-235b-a22b-instruct",
        max_tokens: int = 16384,
        stream: bool = False,
    ) -> Tuple[bool, Any]:
"""Send chat completion request to the API"""
try:
# Build message content
content_parts = self.build_message_content(text, files)
# If no content parts, return error
if not content_parts:
return False, {"error": "No text or valid files provided"}
# Build request payload
payload = {
"model": model,
"messages": [
{
"role": "user",
"content": content_parts
}
],
"max_tokens": max_tokens,
"stream": stream
}
# Build headers
headers = {
"Content-Type": "application/json"
}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
# Send request
response = requests.post(
self.chat_endpoint,
json=payload,
headers=headers,
timeout=600
)
# Check response
if response.status_code == 200:
try:
response_data = response.json()
return True, response_data
except json.JSONDecodeError:
return False, {"error": "Invalid JSON response", "raw_response": response.text}
else:
try:
error_data = response.json()
return False, {"error": f"API Error ({response.status_code})", "details": error_data}
except json.JSONDecodeError:
return False, {"error": f"HTTP {response.status_code}", "raw_response": response.text}
except requests.exceptions.Timeout:
return False, {"error": "Request timeout"}
except requests.exceptions.ConnectionError:
return False, {"error": "Connection error"}
except Exception as e:
return False, {"error": f"Unexpected error: {str(e)}"}
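# Minimal usage sketch for the client outside the UI (illustrative; assumes a
# valid key in API_KEY and a local file named receipt.jpg, both hypothetical):
#
#     client = OmniAPIClient()
#     ok, resp = client.send_chat_completion(
#         text="Extract the text from this image",
#         files=["receipt.jpg"],
#         api_key=os.getenv("API_KEY", ""),
#     )
#     print(resp["choices"][0]["message"]["content"] if ok else resp["error"])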
def create_ui():
"""Create the Gradio UI"""
# Define available API endpoints and their corresponding models
API_ENDPOINTS = {
"https://api.modelharbor.com": [
"typhoon-ocr-preview",
"qwen/qwen3-vl-235b-a22b-instruct",
"openai/gpt-5",
"meta-llama/llama-4-maverick",
"gemini/gemini-2.5-pro",
"gemini/gemini-2.5-flash"
],
"https://api-omni.modelharbor.com": [
"qwen/qwen3-235b-a22b-instruct-2507-omni"
]
}
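    # Any other OpenAI-compatible endpoint can be added here and will show up
    # in the endpoint dropdown, e.g. a hypothetical entry:
    #
    #     "https://api.example.com": ["vendor/model-name"]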
def fetch_models(api_endpoint):
"""Return models based on selected API endpoint"""
models = API_ENDPOINTS.get(api_endpoint, [])
default_model = models[0] if models else ""
return gr.Dropdown(choices=models, value=default_model)
def send_request(api_endpoint, model, max_tokens, text, files):
"""Handle request submission"""
try:
# Validate inputs
if not api_endpoint:
return "❌ API endpoint is required", ""
if not text.strip() and not files:
return "❌ Please provide either text or upload files", ""
# Always use API key from environment variable/secrets
api_key_to_use = os.getenv("API_KEY", "")
# Create client
client = OmniAPIClient(api_endpoint)
            # Filter out None/empty files. Depending on the Gradio version,
            # gr.File may return plain path strings or tempfile-like objects
            # with a .name attribute, so handle both forms.
            valid_files = []
            if files is not None:
                file_list = files if isinstance(files, list) else [files]
                for f in file_list:
                    if isinstance(f, str):
                        valid_files.append(f)
                    elif f is not None and hasattr(f, 'name'):
                        valid_files.append(f.name)
# Send request
success, response = client.send_chat_completion(
text=text,
files=valid_files,
api_key=api_key_to_use,
model=model,
max_tokens=max_tokens
)
if success:
# Format successful response
formatted_response = json.dumps(response, indent=2)
# Extract the assistant's reply if available
if "choices" in response and len(response["choices"]) > 0:
choice = response["choices"][0]
if "message" in choice and "content" in choice["message"]:
                        # Typhoon models may wrap their output as JSON of the
                        # form {"natural_text": "..."}; unwrap it when present
                        if "typhoon" in model.lower():
                            try:
                                json_content = json.loads(choice["message"]["content"])
                                if "natural_text" in json_content:
                                    assistant_reply = json_content["natural_text"]
                                else:
                                    assistant_reply = choice["message"]["content"]
                            except (json.JSONDecodeError, KeyError, TypeError):
                                # Fall back to the raw content when it is not
                                # valid JSON or lacks the expected structure
                                assistant_reply = choice["message"]["content"]
else:
assistant_reply = choice["message"]["content"]
status = f"βœ… Request successful\n\n**Assistant Reply:**\n{assistant_reply}"
else:
status = "βœ… Request successful"
else:
status = "βœ… Request successful"
return status, formatted_response
else:
# Format error response
error_response = json.dumps(response, indent=2)
return f"❌ Request failed", error_response
except Exception as e:
return f"❌ Error: {str(e)}", ""
def clear_form():
"""Clear all form inputs"""
return "", "", "", None
# Custom CSS for better layout
css = """
.gradio-container {
max-width: 1200px;
}
.config-panel {
background-color: #f8f9fa;
border-radius: 8px;
padding: 15px;
margin-bottom: 20px;
}
.input-panel {
border-right: 1px solid #e0e0e0;
padding-right: 20px;
}
.output-panel {
padding-left: 20px;
}
"""
with gr.Blocks(css=css, title="Omni API Chat Interface") as interface:
gr.Markdown("# πŸ€– Omni API Chat Interface")
gr.Markdown("Interact with the Omni API using text, PDFs, images, and audio files")
# Configuration section
with gr.Group(elem_classes=["config-panel"]):
gr.Markdown("## βš™οΈ Configuration")
with gr.Row():
api_endpoint = gr.Dropdown(
label="API Endpoint",
choices=list(API_ENDPOINTS.keys()),
value="https://api.modelharbor.com"
)
with gr.Row():
with gr.Column(scale=3):
model = gr.Dropdown(
label="Model",
choices=API_ENDPOINTS["https://api.modelharbor.com"],
value="qwen/qwen3-235b-a22b-instruct-2507",
interactive=True
)
with gr.Column(scale=2):
max_tokens = gr.Number(
label="Max Tokens",
value=16384,
minimum=1,
maximum=32000
)
# Main interface
with gr.Row():
# Input panel (left side)
with gr.Column(scale=1, elem_classes=["input-panel"]):
gr.Markdown("## πŸ“ Input")
text_input = gr.Textbox(
label="Your Message",
placeholder="Type your message here...",
lines=5
)
file_upload = gr.File(
label="Upload Files",
file_count="multiple",
file_types=[
".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp",
".mp3", ".wav", ".m4a", ".flac", ".ogg"
]
)
with gr.Row():
send_btn = gr.Button("πŸš€ Send Request", variant="primary", size="lg")
clear_btn = gr.Button("πŸ—‘οΈ Clear", variant="secondary")
# Output panel (right side)
with gr.Column(scale=1, elem_classes=["output-panel"]):
gr.Markdown("## πŸ“€ Response")
status_output = gr.Textbox(
label="Status",
placeholder="Response status will appear here...",
lines=8,
max_lines=15,
interactive=False
)
response_output = gr.Code(
label="Raw Response",
language="json",
interactive=False
)
# Example section
with gr.Accordion("πŸ“š Usage Examples", open=False):
gr.Markdown("""
### Example Requests:
**Text Only:**
- Message: "Hello, how are you?"
- Files: None
**PDF Analysis:**
- Message: "Please summarize this document"
- Files: document.pdf
**Image OCR:**
- Message: "Extract text from this image"
- Files: receipt.jpg
**Audio Transcription:**
- Message: "Transcribe this audio file"
- Files: meeting.mp3
**Multi-modal:**
- Message: "Analyze these files and provide insights"
- Files: report.pdf, chart.png, recording.wav
### Supported File Types:
- **PDFs**: .pdf
- **Images**: .jpg, .jpeg, .png, .gif, .bmp, .webp
- **Audio**: .mp3, .wav, .m4a, .flac, .ogg
""")
# Event handlers
send_btn.click(
fn=send_request,
inputs=[api_endpoint, model, max_tokens, text_input, file_upload],
outputs=[status_output, response_output]
)
clear_btn.click(
fn=clear_form,
outputs=[text_input, status_output, response_output, file_upload]
)
# Allow Enter key to submit (when text input is focused)
text_input.submit(
fn=send_request,
inputs=[api_endpoint, model, max_tokens, text_input, file_upload],
outputs=[status_output, response_output]
)
# Update model list when API endpoint changes
api_endpoint.change(
fn=fetch_models,
inputs=[api_endpoint],
outputs=[model]
)
return interface
if __name__ == "__main__":
# Create and launch the interface
demo = create_ui()
# Launch with custom settings
demo.launch(
        # server_name="127.0.0.1",   # Bind to localhost instead of 0.0.0.0
        # server_port=7892,          # Use a different port to avoid conflicts
        # share=False,               # Set to True to create a public link
        # debug=True,                # Enable debug mode for verbose errors
        # show_error=True,           # Surface detailed error messages in the UI
        # inbrowser=False,           # Set to True to auto-open a browser tab
        # prevent_thread_lock=False  # Keep the main thread blocked while serving
)