Spaces:

utarn
/

ai_ocr

Running

App Files Files Community

ai_ocr / app.py

utarn

Fix launch

f3fa6c4 about 2 months ago

raw

history blame

15.5 kB

	import gradio as gr
	import requests
	import json
	import base64
	import os
	from typing import List, Optional, Tuple, Any
	import mimetypes

	class OmniAPIClient:
	    """Client for the Omni API chat-completions endpoint.

	    Text and local files are packed into a single ``user`` message and
	    posted to ``{base_url}/v1/chat/completions``.
	    """

	    # Models offered to callers; the list is fixed rather than fetched
	    # from ``models_endpoint``.
	    AVAILABLE_MODELS = (
	        "typhoon-ocr-preview",
	        "openai/gpt-5",
	        "meta-llama/llama-4-maverick",
	        "qwen/qwen3-235b-a22b-instruct-2507",
	        "gemini/gemini-2.5-pro",
	        "gemini/gemini-2.5-flash",
	    )

	    def __init__(self, base_url: str = "https://api.modelharbor.com"):
	        """Store the base URL (trailing slashes removed) and derive endpoints."""
	        self.base_url = base_url.rstrip('/')
	        self.chat_endpoint = f"{self.base_url}/v1/chat/completions"
	        self.models_endpoint = f"{self.base_url}/v1/models"

	    def encode_file_to_base64(self, file_path: str) -> str:
	        """Return the file's bytes as a base64 string.

	        Raises:
	            OSError: if the file cannot be opened or read.
	        """
	        with open(file_path, "rb") as file:
	            return base64.b64encode(file.read()).decode('utf-8')

	    def get_mime_type(self, file_path: str) -> str:
	        """Guess the MIME type from the file name.

	        Falls back to ``application/octet-stream`` when the extension is
	        not recognised.
	        """
	        mime_type, _ = mimetypes.guess_type(file_path)
	        return mime_type or "application/octet-stream"

	    def create_file_content(self, file_path: str, file_type: str = "file") -> dict:
	        """Build the message part for one file.

	        Images become an ``image_url`` part carrying a data URL; every
	        other file becomes a ``file`` part with its name and a data URL.

	        Args:
	            file_path: Path of the file to embed.
	            file_type: Unused; kept (now optional) so existing callers
	                that pass it keep working.

	        Raises:
	            OSError: if the file cannot be read.
	        """
	        mime_type = self.get_mime_type(file_path)
	        # Both part shapes carry the same data URL, so build it once.
	        data_url = f"data:{mime_type};base64,{self.encode_file_to_base64(file_path)}"

	        if mime_type.startswith('image/'):
	            return {
	                "type": "image_url",
	                "image_url": {"url": data_url},
	            }
	        return {
	            "type": "file",
	            "file": {
	                "filename": os.path.basename(file_path),
	                "file_data": data_url,
	            },
	        }

	    def build_message_content(self, text: Optional[str], files: Optional[List[str]]) -> List[dict]:
	        """Assemble the ``content`` list for a user message.

	        The text part (if non-blank) comes first, followed by one part
	        per file in the order given. ``None`` is accepted for either
	        argument and treated as "nothing supplied". Paths that are empty
	        or do not point at a regular file are skipped.
	        """
	        content_parts = []

	        if text and text.strip():
	            content_parts.append({
	                "type": "text",
	                "text": text,
	            })

	        for file_path in files or []:
	            # isfile (not exists): a directory cannot be base64-encoded,
	            # so it is skipped like a missing path instead of failing in open().
	            if file_path and os.path.isfile(file_path):
	                content_parts.append(self.create_file_content(file_path, "file"))

	        return content_parts

	    def get_available_models(self, api_key: str = "") -> Tuple[bool, List[str]]:
	        """Return ``(True, models)`` with a fresh copy of the fixed model list.

	        ``api_key`` is accepted but not used; no request is made.
	        """
	        return True, list(self.AVAILABLE_MODELS)

	    @staticmethod
	    def _parse_response(response) -> Tuple[bool, Any]:
	        """Map an HTTP response onto the ``(success, payload)`` convention."""
	        try:
	            body = response.json()
	        except ValueError:
	            # requests' JSON decode error is a ValueError in every variant
	            # (stdlib json or simplejson), so catch the common base class.
	            if response.status_code == 200:
	                return False, {"error": "Invalid JSON response", "raw_response": response.text}
	            return False, {"error": f"HTTP {response.status_code}", "raw_response": response.text}

	        if response.status_code == 200:
	            return True, body
	        return False, {"error": f"API Error ({response.status_code})", "details": body}

	    def send_chat_completion(self, text: str, files: List[str], api_key: str = "", model: str = "qwen/qwen3-235b-a22b-instruct-2507", max_tokens: int = 16384, stream: bool = False, timeout: float = 60) -> Tuple[bool, Any]:
	        """Send one chat-completion request and report the outcome.

	        Args:
	            text: Prompt text; may be blank or ``None`` when files are given.
	            files: Paths of files to attach; may be empty or ``None``.
	            api_key: Sent as a Bearer token when non-empty.
	            model: Model identifier placed in the payload.
	            max_tokens: Completion limit; coerced to ``int`` unless ``None``.
	            stream: Forwarded in the payload. The response is still parsed
	                as a single JSON document.
	            timeout: Seconds passed to ``requests.post``.

	        Returns:
	            ``(True, response_json)`` on HTTP 200 with a JSON body,
	            otherwise ``(False, {"error": ..., ...})``. Exceptions are
	            converted to the failure form rather than propagated.
	        """
	        try:
	            content_parts = self.build_message_content(text, files)

	            if not content_parts:
	                return False, {"error": "No text or valid files provided"}

	            payload = {
	                "model": model,
	                "messages": [
	                    {
	                        "role": "user",
	                        "content": content_parts,
	                    }
	                ],
	                # Callers such as numeric UI widgets may pass a float;
	                # send an integer. None is forwarded unchanged.
	                "max_tokens": max_tokens if max_tokens is None else int(max_tokens),
	                "stream": stream,
	            }

	            headers = {"Content-Type": "application/json"}
	            if api_key:
	                headers["Authorization"] = f"Bearer {api_key}"

	            response = requests.post(
	                self.chat_endpoint,
	                json=payload,
	                headers=headers,
	                timeout=timeout,
	            )
	            return self._parse_response(response)

	        except requests.exceptions.Timeout:
	            return False, {"error": "Request timeout"}
	        except requests.exceptions.ConnectionError:
	            return False, {"error": "Connection error"}
	        except Exception as e:
	            return False, {"error": f"Unexpected error: {str(e)}"}


	def create_ui():
	    """Build the Gradio Blocks interface for the Omni API client.

	    The page has a configuration panel (base URL, API key, model, max
	    tokens), an input column (message and file upload), an output column
	    (status text and raw JSON), and a collapsed usage-examples section.

	    Returns:
	        The constructed ``gr.Blocks`` object; the caller launches it.
	    """

	    # One list shared by the dropdown and its on-load refresh.
	    model_choices = [
	        "typhoon-ocr-preview",
	        "openai/gpt-5",
	        "meta-llama/llama-4-maverick",
	        "qwen/qwen3-235b-a22b-instruct-2507",
	        "gemini/gemini-2.5-pro",
	        "gemini/gemini-2.5-flash"
	    ]
	    default_model = "qwen/qwen3-235b-a22b-instruct-2507"

	    def fetch_models(base_url, api_key):
	        """Return the model dropdown populated with the fixed model list.

	        Both arguments are ignored; they are accepted because the load
	        event passes them.
	        """
	        return gr.Dropdown(choices=model_choices, value=default_model)

	    def collect_file_paths(files):
	        """Normalise the upload value to a list of path strings.

	        Accepts ``None``, a single item, or a list of items. An item may
	        be an object exposing ``.name`` or a plain path string; anything
	        else is dropped.
	        """
	        if files is None:
	            return []
	        if not isinstance(files, (list, tuple)):
	            files = [files]

	        paths = []
	        for item in files:
	            name = getattr(item, "name", None)
	            if name:
	                paths.append(name)
	            elif isinstance(item, str) and item:
	                # Plain path strings carry no .name attribute.
	                paths.append(item)
	        return paths

	    def unwrap_typhoon_reply(content):
	        """Return ``natural_text`` when content is JSON like {"natural_text": ...}.

	        Content that is not valid JSON, or is JSON without that key, is
	        returned unchanged.
	        """
	        try:
	            parsed = json.loads(content)
	        except (TypeError, ValueError):
	            # ValueError covers json.JSONDecodeError: a plain-text reply
	            # is a normal case here, not a failure.
	            return content
	        if isinstance(parsed, dict) and "natural_text" in parsed:
	            return parsed["natural_text"]
	        return content

	    def send_request(base_url, api_key, model, max_tokens, text, files):
	        """Validate the form, call the API and format the two outputs.

	        Returns:
	            ``(status_text, raw_json_text)`` for the status box and the
	            raw-response viewer.
	        """
	        try:
	            if not base_url:
	                return "❌ Base URL is required", ""

	            text = text or ""
	            if not text.strip() and not files:
	                return "❌ Please provide either text or upload files", ""

	            # The number field may deliver a float; the API payload wants
	            # an integer. None is passed through unchanged.
	            if max_tokens is not None:
	                max_tokens = int(max_tokens)

	            client = OmniAPIClient(base_url)
	            success, response = client.send_chat_completion(
	                text=text,
	                files=collect_file_paths(files),
	                api_key=api_key,
	                model=model,
	                max_tokens=max_tokens
	            )

	            # ensure_ascii=False keeps non-Latin text readable in the viewer.
	            raw_json = json.dumps(response, indent=2, ensure_ascii=False)

	            if not success:
	                return "❌ Request failed", raw_json

	            status = "✅ Request successful"
	            if "choices" in response and len(response["choices"]) > 0:
	                choice = response["choices"][0]
	                if "message" in choice and "content" in choice["message"]:
	                    assistant_reply = choice["message"]["content"]
	                    if "typhoon" in (model or "").lower():
	                        assistant_reply = unwrap_typhoon_reply(assistant_reply)
	                    status = f"✅ Request successful\n\nAssistant Reply:\n{assistant_reply}"

	            return status, raw_json

	        except Exception as e:
	            return f"❌ Error: {str(e)}", ""

	    def clear_form():
	        """Reset the message, status, raw response and file upload."""
	        return "", "", "", None

	    # Custom CSS for better layout
	    css = """
	.gradio-container {
	max-width: 1200px;
	}
	.config-panel {
	background-color: #f8f9fa;
	border-radius: 8px;
	padding: 15px;
	margin-bottom: 20px;
	}
	.input-panel {
	border-right: 1px solid #e0e0e0;
	padding-right: 20px;
	}
	.output-panel {
	padding-left: 20px;
	}
	"""

	    with gr.Blocks(css=css, title="Omni API Chat Interface") as interface:
	        gr.Markdown("# 🤖 Omni API Chat Interface")
	        gr.Markdown("Interact with the Omni API using text, PDFs, images, and audio files")

	        # Configuration section
	        with gr.Group(elem_classes=["config-panel"]):
	            gr.Markdown("## ⚙️ Configuration")
	            with gr.Row():
	                base_url = gr.Textbox(
	                    label="API Base URL",
	                    value="https://api.modelharbor.com",
	                    placeholder="https://api.modelharbor.com"
	                )
	                api_key = gr.Textbox(
	                    label="API Key (Optional)",
	                    type="password",
	                    placeholder="Enter your API key if required"
	                )

	            with gr.Row():
	                with gr.Column(scale=3):
	                    model = gr.Dropdown(
	                        label="Model",
	                        choices=model_choices,
	                        value=default_model,
	                        interactive=True
	                    )
	                with gr.Column(scale=2):
	                    max_tokens = gr.Number(
	                        label="Max Tokens",
	                        value=16384,
	                        minimum=1,
	                        maximum=32000
	                    )

	        # Main interface
	        with gr.Row():
	            # Input panel (left side)
	            with gr.Column(scale=1, elem_classes=["input-panel"]):
	                gr.Markdown("## 📝 Input")

	                text_input = gr.Textbox(
	                    label="Your Message",
	                    placeholder="Type your message here...",
	                    lines=5
	                )

	                file_upload = gr.File(
	                    label="Upload Files",
	                    file_count="multiple",
	                    file_types=[
	                        ".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp",
	                        ".mp3", ".wav", ".m4a", ".flac", ".ogg"
	                    ]
	                )

	                with gr.Row():
	                    send_btn = gr.Button("🚀 Send Request", variant="primary", size="lg")
	                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

	            # Output panel (right side)
	            with gr.Column(scale=1, elem_classes=["output-panel"]):
	                gr.Markdown("## 📤 Response")

	                status_output = gr.Textbox(
	                    label="Status",
	                    placeholder="Response status will appear here...",
	                    lines=8,
	                    max_lines=15,
	                    interactive=False
	                )

	                response_output = gr.Code(
	                    label="Raw Response",
	                    language="json",
	                    interactive=False
	                )

	        # Example section
	        with gr.Accordion("📚 Usage Examples", open=False):
	            gr.Markdown("""
	### Example Requests:

	Text Only:
	- Message: "Hello, how are you?"
	- Files: None

	PDF Analysis:
	- Message: "Please summarize this document"
	- Files: document.pdf

	Image OCR:
	- Message: "Extract text from this image"
	- Files: receipt.jpg

	Audio Transcription:
	- Message: "Transcribe this audio file"
	- Files: meeting.mp3

	Multi-modal:
	- Message: "Analyze these files and provide insights"
	- Files: report.pdf, chart.png, recording.wav

	### Supported File Types:
	- PDFs: .pdf
	- Images: .jpg, .jpeg, .png, .gif, .bmp, .webp
	- Audio: .mp3, .wav, .m4a, .flac, .ogg
	""")

	        # Event handlers: the button and the textbox submit event share
	        # the same inputs and outputs.
	        request_inputs = [base_url, api_key, model, max_tokens, text_input, file_upload]
	        request_outputs = [status_output, response_output]

	        send_btn.click(
	            fn=send_request,
	            inputs=request_inputs,
	            outputs=request_outputs
	        )

	        clear_btn.click(
	            fn=clear_form,
	            outputs=[text_input, status_output, response_output, file_upload]
	        )

	        # Keyboard submit from the message box
	        # NOTE(review): with lines=5 the submit key is presumably Shift+Enter - confirm.
	        text_input.submit(
	            fn=send_request,
	            inputs=request_inputs,
	            outputs=request_outputs
	        )

	        # Populate the model dropdown when the interface loads
	        interface.load(
	            fn=fetch_models,
	            inputs=[base_url, api_key],
	            outputs=[model]
	        )

	    return interface


	if __name__ == "__main__":
	    # Build the interface, then serve it with Gradio's default launch
	    # settings. Overrides such as server_name, server_port, share, debug,
	    # show_error, inbrowser or prevent_thread_lock can be passed to
	    # launch() if needed; none are set here.
	    demo = create_ui()
	    demo.launch()