Spaces:
Runtime error
Runtime error
File size: 9,478 Bytes
1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a ab65ffa 1895105 e6b909a 1895105 e6b909a ab65ffa e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a ab65ffa e6b909a 1895105 e6b909a daf4097 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a 1895105 e6b909a afcd394 e6b909a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
import os
import fitz # PyMuPDF
import fasttext
import requests
import json
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from IndicTransToolkit.processor import IndicProcessor
import google.generativeai as genai
import gradio as gr
# === 1. CONFIGURATION & SECRETS ===
# --- Hardware Settings ---
# Inference is pinned to the CPU for compatibility with the free tier.
DEVICE = "cpu"
# --- Language Settings ---
# Language code that marks a line of text as needing translation.
LANGUAGE_TO_TRANSLATE = "mal"
# --- Model IDs (Using the CPU-friendly TrOCR model) ---
OCR_MODEL_ID = "microsoft/trocr-base-printed"
TRANSLATION_MODEL_REPO_ID = "ai4bharat/indictrans2-indic-en-1B"
# --- Gemini API key, read from the environment (Hugging Face Secrets) ---
# Resolves to None when the variable is not set.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
# === 2. LOAD MODELS & CONFIGURE API ===
# Everything in this section runs once, when the module is imported.

# --- Configure Gemini API ---
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    # Only a warning: startup continues so the remaining models still load.
    print("🔴 ERROR: Gemini API key is not set in the Space Secrets.")

# --- Load Translation Model ---
print(f"Loading tokenizer & model: {TRANSLATION_MODEL_REPO_ID} ...")
translation_tokenizer = AutoTokenizer.from_pretrained(
    TRANSLATION_MODEL_REPO_ID,
    trust_remote_code=True,
)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(
    TRANSLATION_MODEL_REPO_ID,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # Use float32 for CPU
).to(DEVICE)
print("✅ Translation model loaded.")
# Pre/post-processor used around the translation model's tokenizer.
ip = IndicProcessor(inference=True)

# --- Load Language Detection Model ---
print("Loading fastText language detector...")
ft_model_path = hf_hub_download(
    repo_id="facebook/fasttext-language-identification",
    filename="model.bin",
)
lang_detect_model = fasttext.load_model(ft_model_path)
print("✅ fastText loaded.")

# --- Load Standard OCR Model ---
print(f"Loading Standard OCR model: {OCR_MODEL_ID}...")
ocr_pipeline = pipeline("image-to-text", model=OCR_MODEL_ID, device=-1)  # device=-1 ensures CPU
print("✅ Standard OCR model loaded.")
# === 3. HELPER FUNCTIONS ===
# --- Phase 1: Text Extraction ---
def classify_image_with_gemini(image: Image.Image) -> str:
    """Uses Gemini to classify an image as a 'document' or 'diagram'.

    Args:
        image: The PIL image sent to the model together with the prompt.

    Returns:
        "diagram" when the model's reply contains the word "diagram",
        otherwise "document".
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = "Is this image primarily a text document or an engineering/technical diagram? Answer with only 'document' or 'diagram'."
    response = model.generate_content([prompt, image])
    # Normalise the reply so the substring check below ignores case and padding.
    classification = response.text.strip().lower()
    print(f"✅ Image classified as: {classification}")
    # Anything that is not clearly a diagram is treated as a document.
    return "diagram" if "diagram" in classification else "document"
def summarize_diagram_with_gemini(image: Image.Image) -> str:
    """Uses Gemini to generate a summary of an engineering diagram.

    Args:
        image: The PIL image of the diagram to describe.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = "You are an engineering assistant for Kochi Metro Rail Limited (KMRL). Describe the contents of this technical diagram or engineering drawing in a concise summary. Identify key components and their apparent purpose."
    response = model.generate_content([prompt, image])
    print("✅ Diagram summary successful.")
    return response.text.strip()
def extract_text_from_image(path):
    """
    Classifies an image and routes it for either OCR (if a text doc) or summarization (if a diagram).

    Args:
        path: Filesystem path of the image to process.

    Returns:
        The Gemini summary for a diagram, the OCR text for a document (an
        empty string when the OCR pipeline yields nothing), or the literal
        string "Error during image processing." when any step raises.
    """
    print("\n--- Starting Image Processing ---")
    try:
        # Image.open keeps the underlying file open; convert() returns an
        # independent in-memory copy, so the handle can be released right away.
        with Image.open(path) as source_image:
            image = source_image.convert("RGB")
        # Step 1: Classify the image using Gemini
        image_type = classify_image_with_gemini(image)
        # Step 2: Route to the correct function
        if image_type == "diagram":
            print("-> Image is a diagram. Summarizing with Gemini...")
            return summarize_diagram_with_gemini(image)
        print("-> Image is a document. Extracting text with TrOCR...")
        out = ocr_pipeline(image)
        return out[0]["generated_text"] if out else ""
    except Exception as e:
        # Best-effort by design: the failure is logged and a placeholder
        # string is returned instead of propagating the exception.
        print(f"❌ An error occurred during image processing: {e}")
        return "Error during image processing."
def extract_text_from_pdf(path):
    """Return the plain text of every page of a PDF, one newline after each page.

    Args:
        path: Filesystem path of the PDF file.

    Returns:
        The concatenated page texts; an empty string for a PDF with no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(path) as doc:
        return "".join(page.get_text("text") + "\n" for page in doc)
def read_text_from_txt(path):
    """Read a UTF-8 text file and return its full contents.

    Args:
        path: Filesystem path of the text file.

    Returns:
        The decoded file contents, without a leading byte-order mark.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise survive as U+FEFF at the start of the text.
    with open(path, "r", encoding="utf-8-sig") as f:
        return f.read()
# --- Phase 2: Translation ---
def detect_language(text_snippet):
    """Detect the language of a text snippet with the fastText model.

    Args:
        text_snippet: The text to classify; newlines are flattened to spaces.

    Returns:
        The bare language code of the top prediction (e.g. "mal"), or None
        when the snippet is blank or the model returns no label.
    """
    # fastText's predict() works on a single line, so newlines are removed first.
    s = text_snippet.replace("\n", " ").strip()
    if not s:
        return None
    preds = lang_detect_model.predict(s, k=1)
    if not preds or not preds[0]:
        return None
    # Labels look like "__label__mal_Mlym": drop the "__label__" prefix, then
    # the script suffix, so the result is comparable to a code such as "mal".
    label = preds[0][0].split("__")[-1]
    return label.split("_")[0]
def translate_chunk(chunk, src_lang="mal_Mlym", tgt_lang="eng_Latn"):
    """Translate one piece of text with the loaded translation model.

    Args:
        chunk: The text to translate.
        src_lang: Language tag of the input text. Defaults to Malayalam.
        tgt_lang: Language tag of the output text. Defaults to English.

    Returns:
        The translated, post-processed text.
    """
    batch = ip.preprocess_batch([chunk], src_lang=src_lang, tgt_lang=tgt_lang)
    # Input longer than 512 tokens is truncated rather than rejected.
    inputs = translation_tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(DEVICE)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated_tokens = translation_model.generate(
            **inputs,
            num_beams=5,
            max_length=512,
            early_stopping=True,
        )
    decoded = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # The batch holds exactly one item, so the first result is the translation.
    return ip.postprocess_batch(decoded, lang=tgt_lang)[0]
# --- Phase 3: Gemini Analysis ---
def generate_structured_json(text_to_analyze):
    """Ask Gemini for a schema-constrained JSON analysis of the given text.

    Args:
        text_to_analyze: The document text embedded in the prompt.

    Returns:
        The model's reply parsed from JSON.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = f"You are an AI assistant for KMRL. Analyze this document and extract key info as JSON: {text_to_analyze}"

    # Schema of one entry in "actions_required".
    action_item_schema = {
        "type": "OBJECT",
        "properties": {
            "action": {"type": "STRING"},
            "priority": {"type": "STRING", "enum": ["High", "Medium", "Low"]},
            "deadline": {"type": "STRING"},
            "notes": {"type": "STRING"},
        },
        "required": ["action", "priority", "deadline", "notes"],
    }
    # Schema of one entry in "cross_document_flags".
    cross_flag_schema = {
        "type": "OBJECT",
        "properties": {
            "related_document_type": {"type": "STRING"},
            "related_issue": {"type": "STRING"},
        },
        "required": ["related_document_type", "related_issue"],
    }
    # Top-level response schema.
    json_schema = {
        "type": "OBJECT",
        "properties": {
            "summary": {"type": "STRING"},
            "actions_required": {"type": "ARRAY", "items": action_item_schema},
            "departments_to_notify": {"type": "ARRAY", "items": {"type": "STRING"}},
            "cross_document_flags": {"type": "ARRAY", "items": cross_flag_schema},
        },
        "required": ["summary", "actions_required", "departments_to_notify", "cross_document_flags"],
    }

    generation_config = genai.types.GenerationConfig(
        response_mime_type="application/json",
        response_schema=json_schema,
    )
    reply = model.generate_content(prompt, generation_config=generation_config)
    parsed = json.loads(reply.text)
    return parsed
def check_relevance_with_gemini(summary_text):
    """Ask Gemini whether a summary concerns transport or metro topics.

    Args:
        summary_text: The summary to be judged.

    Returns:
        True when the lower-cased reply contains "yes", otherwise False.
    """
    model = genai.GenerativeModel('gemini-2.5-flash')
    question = (
        'Is this summary related to transportation, infrastructure, railways, or metro systems? Answer only "Yes" or "No".'
        f"\n\nSummary: {summary_text}"
    )
    reply = model.generate_content(question)
    normalized_reply = reply.text.strip().lower()
    return "yes" in normalized_reply
# === 4. MAIN PROCESSING FUNCTION FOR GRADIO ===
def process_and_analyze_document(input_file):
    """Run the full pipeline on an uploaded file: extract, translate, analyze.

    Args:
        input_file: The upload handed over by the Gradio file input; either
            an object exposing the path as ``.name`` or the path itself.

    Returns:
        The structured analysis dict when the document is judged relevant,
        otherwise a dict with "status" and "reason" keys.

    Raises:
        gr.Error: For a missing API key, a missing or unsupported file, empty
            extracted text, an invalid analysis, or any unexpected failure.
    """
    if not GEMINI_API_KEY:
        raise gr.Error("Gemini API key is not configured. The administrator must set it in the Space Secrets.")
    if input_file is None:
        raise gr.Error("No file uploaded. Please upload a document.")
    try:
        # Accept both a file-like upload (path in .name) and a plain path string.
        input_file_path = getattr(input_file, "name", input_file)
        ext = os.path.splitext(input_file_path)[1].lower()

        # --- Phase 1: Get Original Text ---
        if ext == ".pdf":
            original_text = extract_text_from_pdf(input_file_path)
        elif ext == ".txt":
            original_text = read_text_from_txt(input_file_path)
        elif ext in [".png", ".jpg", ".jpeg"]:
            original_text = extract_text_from_image(input_file_path)
        else:
            raise gr.Error("Unsupported file type.")
        if not original_text or not original_text.strip():
            raise gr.Error("No text could be extracted from the document.")

        # --- Phase 2: Translate if Necessary ---
        translated_lines = []
        for ln in original_text.split("\n"):
            if not ln.strip():
                continue
            lang = detect_language(ln)
            # The detected code may carry a script suffix (e.g. "mal_Mlym");
            # only the language part is compared with the configured code.
            if lang and lang.split("_")[0] == LANGUAGE_TO_TRANSLATE:
                translated_lines.append(translate_chunk(ln))
            else:
                translated_lines.append(ln)
        final_text = "\n".join(translated_lines)

        # --- Phase 3: Analyze with Gemini ---
        summary_data = generate_structured_json(final_text)
        if not summary_data or "summary" not in summary_data:
            raise gr.Error("Failed to generate a valid analysis from the document.")
        if check_relevance_with_gemini(summary_data["summary"]):
            return summary_data
        return {"status": "Not Applicable", "reason": "The document was determined to be not relevant to KMRL."}
    except gr.Error:
        # Already a user-facing message; do not re-wrap it as "unexpected".
        raise
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred: {str(e)}") from e
# Optional example document. It is only offered in the UI when the file has
# actually been uploaded to the Space, so a missing file cannot break startup.
_EXAMPLE_FILE = "Malayalam-en.txt"

# Gradio UI: one file input, JSON output, backed by the processing pipeline.
iface = gr.Interface(
    fn=process_and_analyze_document,
    inputs=gr.File(label="Upload Document (.pdf, .txt, .png, .jpeg)"),
    outputs=gr.JSON(label="Analysis Result"),
    title="KMRL Document Analysis Pipeline",
    description="Upload a document (Malayalam or English). The system will detect and translate Malayalam text to English, then send the full text to Gemini for structured analysis.",
    allow_flagging="never",
    examples=[[_EXAMPLE_FILE]] if os.path.exists(_EXAMPLE_FILE) else None,
)
# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()