Spaces:

konkani
/

Goan-information

Running

App Files Community

Reubencf committed on Aug 20

Commit

b32d25e

verified ·

1 Parent(s): 6f40086

Update app.py

Browse files

Files changed (1) hide show

app.py +237 -110

app.py CHANGED Viewed

@@ -1,56 +1,95 @@
-# app.py — Optimized for Hugging Face Spaces Free Tier (CPU-only)
 import os
 import gc
 import torch
 import gradio as gr
 from typing import List, Tuple
-from peft import PeftConfig, PeftModel
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 # ── Configuration ──────────────────────────────────────────────────────────────
-HF_TOKEN = os.environ.get("HF_TOKEN")  # set in Space → Settings → Variables & secrets
-ADAPTER_ID = "Reubencf/gemma3-goan-finetuned"  # your LoRA adapter repo
-# Free tier optimization flags
-USE_8BIT = False  # Set to True if you have access to GPU tier
-MAX_MEMORY = "15GB"  # Conservative for free tier
-DEVICE = "cpu"  # Force CPU for free tier
 TITLE = "🌴 Gemma Goan Q&A Bot"
 DESCRIPTION = """
-Gemma-3-4B-Instruct base + LoRA adapter fine-tuned on a Goan Q&A dataset.
 Ask about Goa, Konkani culture, or general topics!
-**Adapter**: https://huggingface.co/Reubencf/gemma3-goan-finetuned
-⚠️ **Note**: Running on free tier (CPU). Responses may be slower. For faster inference, consider upgrading to GPU tier.
 """
-# ── Load model + tokenizer (optimized for free tier) ───────────────────────────
 def load_model_and_tokenizer():
-    """Load model with memory optimizations for free tier"""
-    print("[Init] Starting model load for free tier...")
-    # Get the base model ID from adapter config
-    peft_cfg = PeftConfig.from_pretrained(ADAPTER_ID, token=HF_TOKEN)
-    base_id = peft_cfg.base_model_name_or_path
-    print(f"[Load] Base model: {base_id}")
-    # Memory cleanup before loading
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     try:
-        # Load base model with memory optimizations
-        print("[Load] Loading base model with CPU optimizations...")
-        # Quantization config (only if GPU available and enabled)
         quantization_config = None
         if USE_8BIT and torch.cuda.is_available():
             quantization_config = BitsAndBytesConfig(
                 load_in_8bit=True,
                 bnb_8bit_compute_dtype=torch.float16
@@ -58,74 +97,125 @@ def load_model_and_tokenizer():
         # Load base model
         base_model = AutoModelForCausalLM.from_pretrained(
-            base_id,
             token=HF_TOKEN,
             trust_remote_code=True,
             quantization_config=quantization_config,
             low_cpu_mem_usage=True,
             torch_dtype=torch.float32 if DEVICE == "cpu" else torch.float16,
-            device_map=None,  # We'll move manually
-            max_memory={0: MAX_MEMORY} if torch.cuda.is_available() else None,
         )
-        # Move to device
-        if DEVICE == "cpu":
             base_model = base_model.to("cpu")
-            print("[Load] Model moved to CPU")
-        # Load and apply LoRA adapter
-        print("[Load] Loading LoRA adapter...")
-        model = PeftModel.from_pretrained(
-            base_model,
-            ADAPTER_ID,
             token=HF_TOKEN,
             trust_remote_code=True,
-            is_trainable=False,  # Inference only
         )
-        # Merge adapter with base (reduces memory overhead during inference)
-        print("[Load] Merging adapter for efficiency...")
-        model = model.merge_and_unload()
-        print("[Load] Model loaded successfully!")
     except Exception as e:
-        print(f"[Error] Failed to load model: {e}")
-        raise gr.Error(
-            f"Failed to load model. This may be due to memory constraints on free tier. "
-            f"Consider using a smaller model or upgrading to GPU tier. Error: {str(e)}"
-        )
-    # Load tokenizer
-    print("[Load] Loading tokenizer...")
-    tokenizer = AutoTokenizer.from_pretrained(
-        base_id,
-        token=HF_TOKEN,
-        use_fast=True,
-        trust_remote_code=True,
-    )
-    # Set padding token
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"  # Better for generation
-    # Set model to eval mode
-    model.eval()
-    # Memory cleanup
-    gc.collect()
-    return model, tokenizer, base_id
-# Load model globally (done once at startup)
 try:
-    model, tokenizer, BASE_ID = load_model_and_tokenizer()
     MODEL_LOADED = True
 except Exception as e:
     print(f"[Fatal] Could not load model: {e}")
     MODEL_LOADED = False
-    model, tokenizer, BASE_ID = None, None, None
 # ── Generation function ─────────────────────────────────────────────────────────
 def generate_response(
@@ -136,81 +226,108 @@ def generate_response(
     top_p: float = 0.95,
     repetition_penalty: float = 1.1,
 ) -> str:
-    """Generate response using the fine-tuned model"""
     if not MODEL_LOADED:
-        return "⚠️ Model failed to load. This usually happens due to memory constraints on the free tier. Please try again later or contact the space owner."
     try:
-        # Build conversation history
         conversation = []
-        for user_msg, assistant_msg in history:
-            if user_msg:
-                conversation.append({"role": "user", "content": user_msg})
-            if assistant_msg:
-                conversation.append({"role": "assistant", "content": assistant_msg})
         conversation.append({"role": "user", "content": message})
         # Apply chat template
-        prompt = tokenizer.apply_chat_template(
-            conversation,
-            add_generation_prompt=True,
-            return_tensors="pt"
-        )
-        # Move to model device
-        prompt = prompt.to(model.device)
-        # Generate with memory-efficient settings
         with torch.no_grad():
-            # Use cache for faster generation
             outputs = model.generate(
                 input_ids=prompt,
-                max_new_tokens=min(int(max_new_tokens), 256),  # Cap for free tier
                 temperature=float(temperature),
                 top_p=float(top_p),
                 repetition_penalty=float(repetition_penalty),
                 do_sample=True,
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
-                use_cache=True,  # Enable KV cache
             )
-        # Decode only the generated tokens
         generated_tokens = outputs[0][prompt.shape[-1]:]
         response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
-        # Memory cleanup after generation
         del outputs, prompt, generated_tokens
         gc.collect()
         return response
-    except torch.cuda.OutOfMemoryError:
-        gc.collect()
-        torch.cuda.empty_cache()
-        return "⚠️ Out of memory. Try reducing max_new_tokens or restarting the space."
     except Exception as e:
-        return f"⚠️ Error generating response: {str(e)}"
 # ── Gradio Interface ────────────────────────────────────────────────────────────
 examples = [
     ["What is the capital of Goa?"],
     ["Tell me about Konkani language"],
-    ["What are the famous beaches in Goa?"],
     ["Describe Goan fish curry"],
     ["What is the history of Old Goa?"],
 ]
-# Create the chat interface
 if MODEL_LOADED:
     demo = gr.ChatInterface(
         fn=generate_response,
         title=TITLE,
         description=DESCRIPTION,
         examples=examples,
-        retry_btn=None,  # Disable retry to save memory
-        undo_btn=None,   # Disable undo to save memory
         additional_inputs=[
             gr.Slider(
                 minimum=0.1,
@@ -222,7 +339,7 @@ if MODEL_LOADED:
             gr.Slider(
                 minimum=32,
                 maximum=256,
-                value=128,  # Reduced default for free tier
                 step=16,
                 label="Max new tokens"
             ),
@@ -244,21 +361,31 @@ if MODEL_LOADED:
         theme=gr.themes.Soft(),
     )
 else:
-    # Fallback interface if model fails to load
     demo = gr.Interface(
-        fn=lambda x: "⚠️ Model failed to load. Please check the logs or try restarting the space.",
         inputs=gr.Textbox(label="Message"),
         outputs=gr.Textbox(label="Response"),
         title=TITLE,
-        description="**Error**: Model could not be loaded. This is likely due to memory constraints on the free tier.",
     )
-# Queue for handling multiple users
-demo.queue(
-    concurrency_count=1,  # Process one at a time to save memory
-    max_size=10,          # Reduced queue size for free tier
-)
-# Launch the app
 if __name__ == "__main__":
     demo.launch()

+# app.py — Corrected for proper LoRA adapter loading
 import os
 import gc
 import torch
 import gradio as gr
 from typing import List, Tuple
+import warnings
+warnings.filterwarnings('ignore')
+try:
+    from peft import PeftConfig, PeftModel
+    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+    IMPORTS_OK = True
+except ImportError as e:
+    IMPORTS_OK = False
+    print(f"Missing dependencies: {e}")
+    print("Please install: pip install transformers peft torch gradio accelerate")
 # ── Configuration ──────────────────────────────────────────────────────────────
+HF_TOKEN = os.environ.get("HF_TOKEN")  # Optional for public models
+# Your LoRA adapter location (HuggingFace repo or local path)
+ADAPTER_ID = "Reubencf/gemma3-goan-finetuned"
+# For local adapter: ADAPTER_ID = "./path/to/your/adapter"
+# Base model - MUST match what you used for fine-tuning!
+# Check your adapter's config.json for "base_model_name_or_path"
+BASE_MODEL_ID = "google/gemma-2b-it"  # Change this to your actual base model
+# Common options:
+# - "google/gemma-2b-it" (2B parameters, easier on memory)
+# - "unsloth/gemma-2-2b-it-bnb-4bit" (quantized version)
+# - Your actual base model used for training
+# Settings
+USE_8BIT = False  # Set to True if you have GPU and want to use 8-bit quantization
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TITLE = "🌴 Gemma Goan Q&A Bot"
 DESCRIPTION = """
+Gemma base model + LoRA adapter fine-tuned on a Goan Q&A dataset.
 Ask about Goa, Konkani culture, or general topics!
+**Status**: {}
 """
+# ── Load model + tokenizer (correct LoRA loading) ──────────────────────────────
 def load_model_and_tokenizer():
+    """Load base model and apply LoRA adapter correctly"""
+    if not IMPORTS_OK:
+        raise ImportError("Required packages not installed")
+    print("[Init] Starting model load...")
+    print(f"[Config] Base model: {BASE_MODEL_ID}")
+    print(f"[Config] LoRA adapter: {ADAPTER_ID}")
+    print(f"[Config] Device: {DEVICE}")
+    # Memory cleanup
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
+    status = ""
+    model = None
+    tokenizer = None
     try:
+        # Step 1: Try to read adapter config to get the correct base model
+        actual_base_model = BASE_MODEL_ID
+        try:
+            print(f"[Load] Checking adapter configuration...")
+            peft_config = PeftConfig.from_pretrained(ADAPTER_ID, token=HF_TOKEN)
+            actual_base_model = peft_config.base_model_name_or_path
+            print(f"[Load] Adapter expects base model: {actual_base_model}")
+            # Warn if mismatch
+            if actual_base_model != BASE_MODEL_ID:
+                print(f"[Warning] BASE_MODEL_ID ({BASE_MODEL_ID}) doesn't match adapter's base ({actual_base_model})")
+                print(f"[Load] Using adapter's base model: {actual_base_model}")
+        except Exception as e:
+            print(f"[Warning] Cannot read adapter config: {e}")
+            print(f"[Load] Will try with configured base model: {BASE_MODEL_ID}")
+            actual_base_model = BASE_MODEL_ID
+        # Step 2: Load the BASE MODEL (not the adapter!)
+        print(f"[Load] Loading base model: {actual_base_model}")
+        # Quantization config for GPU
         quantization_config = None
         if USE_8BIT and torch.cuda.is_available():
+            print("[Load] Using 8-bit quantization")
             quantization_config = BitsAndBytesConfig(
                 load_in_8bit=True,
                 bnb_8bit_compute_dtype=torch.float16
         # Load base model
         base_model = AutoModelForCausalLM.from_pretrained(
+            actual_base_model,
             token=HF_TOKEN,
             trust_remote_code=True,
             quantization_config=quantization_config,
             low_cpu_mem_usage=True,
             torch_dtype=torch.float32 if DEVICE == "cpu" else torch.float16,
+            device_map="auto" if torch.cuda.is_available() else None,
         )
+        # Move to device if needed
+        if DEVICE == "cpu" and not torch.cuda.is_available():
             base_model = base_model.to("cpu")
+            print("[Load] Model on CPU")
+        print("[Load] Base model loaded successfully")
+        # Step 3: Load tokenizer from the BASE MODEL
+        print(f"[Load] Loading tokenizer from base model...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            actual_base_model,
             token=HF_TOKEN,
+            use_fast=True,
             trust_remote_code=True,
         )
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.padding_side = "left"
+        # Step 4: Try to apply LoRA adapter
+        try:
+            print(f"[Load] Applying LoRA adapter: {ADAPTER_ID}")
+            model = PeftModel.from_pretrained(
+                base_model,
+                ADAPTER_ID,
+                token=HF_TOKEN,
+                trust_remote_code=True,
+                is_trainable=False,  # Inference only
+            )
+            # Optional: Merge adapter with base model for faster inference
+            # This combines the weights permanently (uses more memory initially but faster inference)
+            merge = input("\n💡 Merge adapter for faster inference? (y/n, default=y): ").strip().lower()
+            if merge != 'n':
+                print("[Load] Merging adapter with base model...")
+                model = model.merge_and_unload()
+                print("[Load] Adapter merged successfully")
+                status = f"✅ Using fine-tuned model (merged): {ADAPTER_ID}"
+            else:
+                print("[Load] Using adapter without merging")
+                status = f"✅ Using fine-tuned model: {ADAPTER_ID}"
+        except FileNotFoundError as e:
+            print(f"[Error] Adapter files not found: {e}")
+            print("[Fallback] Using base model without fine-tuning")
+            model = base_model
+            status = f"⚠️ Adapter not found. Using base model only: {actual_base_model}"
+        except Exception as e:
+            print(f"[Error] Failed to load adapter: {e}")
+            print("[Fallback] Using base model without fine-tuning")
+            model = base_model
+            status = f"⚠️ Could not load adapter. Using base model only: {actual_base_model}"
+        # Step 5: Final setup
+        model.eval()
+        print(f"[Load] Model ready on {DEVICE}!")
+        # Memory cleanup
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return model, tokenizer, status
     except Exception as e:
+        error_msg = f"Failed to load model: {str(e)}"
+        print(f"[Fatal] {error_msg}")
+        # Try fallback to smallest model
+        if "gemma-2b" not in BASE_MODEL_ID.lower():
+            print("[Fallback] Trying with gemma-2b-it...")
+            try:
+                base_model = AutoModelForCausalLM.from_pretrained(
+                    "google/gemma-2b-it",
+                    token=HF_TOKEN,
+                    trust_remote_code=True,
+                    low_cpu_mem_usage=True,
+                    torch_dtype=torch.float32,
+                    device_map=None,
+                ).to("cpu")
+                tokenizer = AutoTokenizer.from_pretrained(
+                    "google/gemma-2b-it",
+                    token=HF_TOKEN,
+                    trust_remote_code=True,
+                )
+                if tokenizer.pad_token is None:
+                    tokenizer.pad_token = tokenizer.eos_token
+                base_model.eval()
+                return base_model, tokenizer, "⚠️ Using fallback model: gemma-2b-it (no fine-tuning)"
+            except Exception as fallback_error:
+                print(f"[Fatal] Fallback also failed: {fallback_error}")
+                raise gr.Error(f"Cannot load any model. Check your configuration.")
+        else:
+            raise gr.Error(error_msg)
+# Load model globally
 try:
+    model, tokenizer, STATUS_MSG = load_model_and_tokenizer()
     MODEL_LOADED = True
+    DESCRIPTION = DESCRIPTION.format(STATUS_MSG)
 except Exception as e:
     print(f"[Fatal] Could not load model: {e}")
     MODEL_LOADED = False
+    model, tokenizer = None, None
+    DESCRIPTION = DESCRIPTION.format(f"❌ Model failed to load: {str(e)[:100]}")
 # ── Generation function ─────────────────────────────────────────────────────────
 def generate_response(
     top_p: float = 0.95,
     repetition_penalty: float = 1.1,
 ) -> str:
+    """Generate response using the model"""
     if not MODEL_LOADED:
+        return "⚠️ Model failed to load. Please check the logs or restart the application."
     try:
+        # Build conversation
         conversation = []
+        if history:
+            # Keep last 3 exchanges for context
+            for user_msg, assistant_msg in history[-3:]:
+                if user_msg:
+                    conversation.append({"role": "user", "content": user_msg})
+                if assistant_msg:
+                    conversation.append({"role": "assistant", "content": assistant_msg})
         conversation.append({"role": "user", "content": message})
         # Apply chat template
+        try:
+            prompt = tokenizer.apply_chat_template(
+                conversation,
+                add_generation_prompt=True,
+                return_tensors="pt"
+            )
+        except Exception as e:
+            print(f"[Warning] Chat template failed: {e}, using fallback format")
+            # Fallback format
+            prompt_text = ""
+            for msg in conversation:
+                if msg["role"] == "user":
+                    prompt_text += f"User: {msg['content']}\n"
+                else:
+                    prompt_text += f"Assistant: {msg['content']}\n"
+            prompt_text += "Assistant: "
+            inputs = tokenizer(
+                prompt_text,
+                return_tensors="pt",
+                truncation=True,
+                max_length=512
+            )
+            prompt = inputs.input_ids
+        # Move to device
+        prompt = prompt.to(model.device if hasattr(model, 'device') else DEVICE)
+        # Generate
+        print(f"[Generate] Input length: {prompt.shape[-1]} tokens")
         with torch.no_grad():
             outputs = model.generate(
                 input_ids=prompt,
+                max_new_tokens=min(int(max_new_tokens), 256),
                 temperature=float(temperature),
                 top_p=float(top_p),
                 repetition_penalty=float(repetition_penalty),
                 do_sample=True,
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
+                use_cache=True,
             )
+        # Decode only generated tokens
         generated_tokens = outputs[0][prompt.shape[-1]:]
         response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
+        print(f"[Generate] Output length: {len(generated_tokens)} tokens")
+        # Cleanup
         del outputs, prompt, generated_tokens
         gc.collect()
         return response
     except Exception as e:
+        error_msg = f"⚠️ Error generating response: {str(e)}"
+        print(f"[Error] {error_msg}")
+        # Try to recover memory
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return error_msg
 # ── Gradio Interface ────────────────────────────────────────────────────────────
 examples = [
     ["What is the capital of Goa?"],
     ["Tell me about Konkani language"],
+    ["What are famous beaches in Goa?"],
     ["Describe Goan fish curry"],
     ["What is the history of Old Goa?"],
 ]
+# Create interface
 if MODEL_LOADED:
     demo = gr.ChatInterface(
         fn=generate_response,
         title=TITLE,
         description=DESCRIPTION,
         examples=examples,
+        retry_btn=None,
+        undo_btn=None,
         additional_inputs=[
             gr.Slider(
                 minimum=0.1,
             gr.Slider(
                 minimum=32,
                 maximum=256,
+                value=128,
                 step=16,
                 label="Max new tokens"
             ),
         theme=gr.themes.Soft(),
     )
 else:
     demo = gr.Interface(
+        fn=lambda x: "Model failed to load. Check console for errors.",
         inputs=gr.Textbox(label="Message"),
         outputs=gr.Textbox(label="Response"),
         title=TITLE,
+        description=DESCRIPTION,
     )
+# Queue with version compatibility
+try:
+    # Try newer Gradio syntax first (4.x)
+    demo.queue(default_concurrency_limit=1, max_size=10)
+except TypeError:
+    try:
+        # Fall back to older syntax (3.x)
+        demo.queue(concurrency_count=1, max_size=10)
+    except:
+        # If both fail, try without parameters
+        demo.queue()
 if __name__ == "__main__":
+    print("\n" + "="*50)
+    print(f"🚀 Starting Gradio app on {DEVICE}...")
+    print(f"📍 Base model: {BASE_MODEL_ID}")
+    print(f"🔧 LoRA adapter: {ADAPTER_ID}")
+    print("="*50 + "\n")
     demo.launch()