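"""KMRL Document Analysis Pipeline (Hugging Face Space app).

Phases:
  1. Extract text from an uploaded PDF, TXT, or image. Images are first
     classified by Gemini as 'document' vs. 'diagram'; diagrams are
     summarized instead of OCR'd.
  2. Detect each line's language and translate Malayalam lines to English
     with IndicTrans2.
  3. Send the final English text to Gemini for structured JSON analysis.
"""
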
import os
import fitz  # PyMuPDF
import fasttext
import json
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from IndicTransToolkit.processor import IndicProcessor
import google.generativeai as genai
import gradio as gr

# === 1. CONFIGURATION & SECRETS ===
# --- Load the Gemini API Key from Hugging Face Secrets ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# --- Model IDs (Using the CPU-friendly TrOCR model) ---
TRANSLATION_MODEL_REPO_ID = "ai4bharat/indictrans2-indic-en-1B"
OCR_MODEL_ID = "microsoft/trocr-base-printed"

# --- Language Settings ---
LANGUAGE_TO_TRANSLATE = "mal_Mlym"  # FLORES-200 code for Malayalam, as emitted by the fastText detector

# --- Hardware Settings ---
DEVICE = "cpu"  # Forcing CPU for compatibility with free tier

# === 2. LOAD MODELS & CONFIGURE API ===
# --- Configure Gemini API ---
if not GEMINI_API_KEY:
    print("πŸ”΄ ERROR: Gemini API key is not set in the Space Secrets.")
else:
    genai.configure(api_key=GEMINI_API_KEY)

# --- Load Translation Model ---
print(f"Loading tokenizer & model: {TRANSLATION_MODEL_REPO_ID} ...")
translation_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL_REPO_ID, trust_remote_code=True)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(
    TRANSLATION_MODEL_REPO_ID,
    trust_remote_code=True,
    torch_dtype=torch.float32  # Use float32 for CPU
).to(DEVICE)
print("βœ… Translation model loaded.")
ip = IndicProcessor(inference=True)

# --- Load Language Detection Model ---
print("Loading fastText language detector...")
ft_model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
lang_detect_model = fasttext.load_model(ft_model_path)
print("βœ… fastText loaded.")

# --- Load Standard OCR Model ---
print(f"Loading Standard OCR model: {OCR_MODEL_ID}...")
ocr_pipeline = pipeline("image-to-text", model=OCR_MODEL_ID, device=-1)  # device=-1 ensures CPU
print("βœ… Standard OCR model loaded.")


# === 3. HELPER FUNCTIONS ===

# --- Phase 1: Text Extraction ---
def classify_image_with_gemini(image: Image.Image):
    """Uses Gemini to classify an image as a 'document' or 'diagram'."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = "Is this image primarily a text document or an engineering/technical diagram? Answer with only 'document' or 'diagram'."
    response = model.generate_content([prompt, image])
    classification = response.text.strip().lower()
    print(f"βœ… Image classified as: {classification}")
    return "diagram" if "diagram" in classification else "document"

def summarize_diagram_with_gemini(image: Image.Image):
    """Uses Gemini to generate a summary of an engineering diagram."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = "You are an engineering assistant for Kochi Metro Rail Limited (KMRL). Describe the contents of this technical diagram or engineering drawing in a concise summary. Identify key components and their apparent purpose."
    response = model.generate_content([prompt, image])
    print("βœ… Diagram summary successful.")
    return response.text.strip()

def extract_text_from_image(path):
    """
    Classifies an image and routes it for either OCR (if a text doc) or summarization (if a diagram).
    """
    print("\n--- Starting Image Processing ---")
    try:
        image = Image.open(path).convert("RGB")
        
        # Step 1: Classify the image using Gemini
        image_type = classify_image_with_gemini(image)
        
        # Step 2: Route to the correct function
        if image_type == "diagram":
            print("-> Image is a diagram. Summarizing with Gemini...")
            return summarize_diagram_with_gemini(image)
        else:
            print("-> Image is a document. Extracting text with TrOCR...")
            # Note: TrOCR is trained on single lines of printed text; dense
            # multi-line pages may need line segmentation for good results.
            out = ocr_pipeline(image)
            return out[0]["generated_text"] if out else ""
            
    except Exception as e:
        print(f"❌ An error occurred during image processing: {e}")
        return "Error during image processing."

def extract_text_from_pdf(path):
    with fitz.open(path) as doc:
        return "".join(page.get_text("text") + "\n" for page in doc)

def read_text_from_txt(path):
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

# --- Phase 2: Translation ---
def detect_language(text_snippet):
    s = text_snippet.replace("\n", " ").strip()
    if not s:
        return None
    preds = lang_detect_model.predict(s, k=1)
    # Labels look like "__label__mal_Mlym"; keep only the FLORES-200 code.
    return preds[0][0].split("__")[-1] if preds and preds[0] else None
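
# Illustrative example: for a Malayalam snippet, predict() returns something
# like (('__label__mal_Mlym',), array([0.99])), so detect_language() yields
# "mal_Mlym", matching LANGUAGE_TO_TRANSLATE above.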

def translate_chunk(chunk):
    src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
    batch = ip.preprocess_batch([chunk], src_lang=src_lang, tgt_lang=tgt_lang)
    inputs = translation_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        generated_tokens = translation_model.generate(**inputs, num_beams=5, max_length=512, early_stopping=True)
    decoded = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return ip.postprocess_batch(decoded, lang=tgt_lang)[0]
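
# Optional batched variant (a sketch, not wired into the app): translating one
# line per generate() call is slow on CPU; passing several lines per batch
# amortizes tokenizer and beam-search overhead.
def translate_chunks(chunks):
    batch = ip.preprocess_batch(chunks, src_lang="mal_Mlym", tgt_lang="eng_Latn")
    inputs = translation_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        generated_tokens = translation_model.generate(**inputs, num_beams=5, max_length=512, early_stopping=True)
    decoded = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return ip.postprocess_batch(decoded, lang="eng_Latn")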

# --- Phase 3: Gemini Analysis ---
def generate_structured_json(text_to_analyze):
    """Generates the detailed JSON analysis."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = f"You are an AI assistant for KMRL. Analyze this document and extract key info as JSON: {text_to_analyze}"
    json_schema = {
        "type": "OBJECT",
        "properties": {
            "summary": {"type": "STRING"},
            "actions_required": {
                "type": "ARRAY",
                "items": {
                    "type": "OBJECT",
                    "properties": {
                        "action": {"type": "STRING"},
                        "priority": {"type": "STRING", "enum": ["High", "Medium", "Low"]},
                        "deadline": {"type": "STRING"},
                        "notes": {"type": "STRING"},
                    },
                    "required": ["action", "priority", "deadline", "notes"],
                },
            },
            "departments_to_notify": {"type": "ARRAY", "items": {"type": "STRING"}},
            "cross_document_flags": {
                "type": "ARRAY",
                "items": {
                    "type": "OBJECT",
                    "properties": {
                        "related_document_type": {"type": "STRING"},
                        "related_issue": {"type": "STRING"},
                    },
                    "required": ["related_document_type", "related_issue"],
                },
            },
        },
        "required": ["summary", "actions_required", "departments_to_notify", "cross_document_flags"],
    }
    generation_config = genai.types.GenerationConfig(response_mime_type="application/json", response_schema=json_schema)
    response = model.generate_content(prompt, generation_config=generation_config)
    return json.loads(response.text)
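
# Expected shape of the parsed result (illustrative values only):
# {
#   "summary": "...",
#   "actions_required": [{"action": "...", "priority": "High",
#                         "deadline": "...", "notes": "..."}],
#   "departments_to_notify": ["..."],
#   "cross_document_flags": [{"related_document_type": "...",
#                             "related_issue": "..."}]
# }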

def check_relevance_with_gemini(summary_text):
    """Checks if the summary is relevant to KMRL."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = f'Is this summary related to transportation, infrastructure, railways, or metro systems? Answer only "Yes" or "No".\n\nSummary: {summary_text}'
    response = model.generate_content(prompt)
    return "yes" in response.text.strip().lower()

# === 4. MAIN PROCESSING FUNCTION FOR GRADIO ===
def process_and_analyze_document(input_file):
    if not GEMINI_API_KEY:
        raise gr.Error("Gemini API key is not configured. The administrator must set it in the Space Secrets.")
    if input_file is None:
        raise gr.Error("No file uploaded. Please upload a document.")
    
    try:
        input_file_path = input_file.name
        ext = os.path.splitext(input_file_path)[1].lower()
        
        # --- Phase 1: Get Original Text ---
        if ext == ".pdf":
            original_text = extract_text_from_pdf(input_file_path)
        elif ext == ".txt":
            original_text = read_text_from_txt(input_file_path)
        elif ext in [".png", ".jpg", ".jpeg"]:
            original_text = extract_text_from_image(input_file_path)
        else:
            raise gr.Error("Unsupported file type.")
        
        if not original_text or not original_text.strip():
            raise gr.Error("No text could be extracted from the document.")

        # --- Phase 2: Translate if Necessary ---
        lines = original_text.split("\n")
        translated_lines = []
        for ln in lines:
            if not ln.strip():
                continue
            lang = detect_language(ln)
            if lang == LANGUAGE_TO_TRANSLATE:
                translated_lines.append(translate_chunk(ln))
            else:
                translated_lines.append(ln)
        final_text = "\n".join(translated_lines)

        # --- Phase 3: Analyze with Gemini ---
        summary_data = generate_structured_json(final_text)
        if not summary_data or "summary" not in summary_data:
            raise gr.Error("Failed to generate a valid analysis from the document.")

        is_relevant = check_relevance_with_gemini(summary_data["summary"])

        if is_relevant:
            return summary_data
        else:
            return {"status": "Not Applicable", "reason": "The document was determined to be not relevant to KMRL."}

    except gr.Error:
        raise  # Don't re-wrap errors raised deliberately above.
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred: {e}")


iface = gr.Interface(
    fn=process_and_analyze_document,
    inputs=gr.File(label="Upload Document (.pdf, .txt, .png, .jpg, .jpeg)"),
    outputs=gr.JSON(label="Analysis Result"),
    title="KMRL Document Analysis Pipeline",
    description="Upload a document (Malayalam or English). The system will detect and translate Malayalam text to English, then send the full text to Gemini for structured analysis.",
    allow_flagging="never",
    examples=[
        ["Malayalam-en.txt"] # If you upload this file to your Space
    ]
)

if __name__ == "__main__":
    iface.launch()