Spaces:

AC2513
/

gemma-demo

Running on Zero

App Files Community

AC2513 committed on Aug 27

Commit

1a184e0

1 Parent(s): ce78f65

Revert "added audio processing"

Browse files

This reverts commit 80d03f778e1e2fd2b3d0126abf41a75857409f18.

Files changed (3) hide show

app.py +6 -47
requirements.txt +1 -2
utils.py +6 -62

app.py CHANGED Viewed

@@ -68,16 +68,6 @@ def run(
         f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
     )
-    # Validate audio files are only used with 3n model
-    if message.get("files"):
-        audio_extensions = [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
-        has_audio = any(any(file.lower().endswith(ext) for ext in audio_extensions) for file in message["files"])
-        if has_audio and model_choice != "Gemma 3n E4B":
-            error_msg = "❌ **Audio files are only supported with the Gemma 3n E4B model.**\n\nPlease switch to the Gemma 3n E4B model to process audio files, or remove audio files to continue with the current model."
-            yield error_msg
-            return
     def try_fallback_model(original_model_choice: str):
         fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
         fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
@@ -245,26 +235,13 @@ def run(
             yield error_message
-def update_file_types(model_choice):
-    """Update allowed file types based on model selection."""
-    base_types = [".mp4", ".jpg", ".png", ".pdf"]
-    if model_choice == "Gemma 3n E4B":
-        # Add audio file types for 3n model
-        return base_types + [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
-    return base_types
-# Create a custom textbox that we can update
-custom_textbox = gr.MultimodalTextbox(
-    file_types=[".mp4", ".jpg", ".png", ".pdf"],
-    file_count="multiple",
-    autofocus=True
-)
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
-    textbox=custom_textbox,
     multimodal=True,
     additional_inputs=[
         gr.Dropdown(
@@ -291,7 +268,7 @@ demo = gr.ChatInterface(
             label="Model",
             choices=["Gemma 3 12B", "Gemma 3n E4B"],
             value="Gemma 3 12B",
-            info="Gemma 3 12B: More powerful and detailed responses, supports images, videos, and PDFs. Gemma 3n E4B: Faster processing with efficient performance, supports images, videos, PDFs, and audio files."
         ),
         gr.Slider(
             label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
@@ -316,29 +293,11 @@ demo = gr.ChatInterface(
 # Connect the dropdown to update the textbox
 with demo:
     preset_dropdown = demo.additional_inputs[0]
-    custom_textbox_input = demo.additional_inputs[1]
-    model_dropdown = demo.additional_inputs[2]
-    # Update custom prompt when preset changes
     preset_dropdown.change(
         fn=update_custom_prompt,
         inputs=[preset_dropdown],
-        outputs=[custom_textbox_input]
-    )
-    # Update file types when model changes
-    def update_textbox_file_types(model_choice):
-        allowed_types = update_file_types(model_choice)
-        return gr.MultimodalTextbox(
-            file_types=allowed_types,
-            file_count="multiple",
-            autofocus=True
-        )
-    model_dropdown.change(
-        fn=update_textbox_file_types,
-        inputs=[model_dropdown],
-        outputs=[demo.textbox]
     )
 if __name__ == "__main__":

         f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
     )
     def try_fallback_model(original_model_choice: str):
         fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
         fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
             yield error_message
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
+    textbox=gr.MultimodalTextbox(
+        file_types=[".mp4", ".jpg", ".png", ".pdf"], file_count="multiple", autofocus=True
+    ),
     multimodal=True,
     additional_inputs=[
         gr.Dropdown(
             label="Model",
             choices=["Gemma 3 12B", "Gemma 3n E4B"],
             value="Gemma 3 12B",
+            info="Gemma 3 12B: More powerful and detailed responses, but slower processing. Gemma 3n E4B: Faster processing with efficient performance for most tasks."
         ),
         gr.Slider(
             label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
 # Connect the dropdown to update the textbox
 with demo:
     preset_dropdown = demo.additional_inputs[0]
+    custom_textbox = demo.additional_inputs[1]
     preset_dropdown.change(
         fn=update_custom_prompt,
         inputs=[preset_dropdown],
+        outputs=[custom_textbox]
     )
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -9,5 +9,4 @@ loguru
 python-dotenv
 opencv-python
 timm
-pymupdf
-librosa

 python-dotenv
 opencv-python
 timm
+pymupdf

utils.py CHANGED Viewed

@@ -2,15 +2,12 @@ import os
 import cv2
 import fitz
 import tempfile
-import librosa
-import numpy as np
 from PIL import Image
 from loguru import logger
 # Constants
 MAX_VIDEO_SIZE = 100 * 1024 * 1024  # 100 MB
 MAX_IMAGE_SIZE = 10 * 1024 * 1024   # 10 MB
-MAX_AUDIO_SIZE = 50 * 1024 * 1024   # 50 MB
 PRESET_PROMPTS = {
     "General Assistant": "You are a helpful AI assistant capable of analyzing images, videos, and PDF documents. Provide clear, accurate, and helpful responses to user queries.",
@@ -32,17 +29,13 @@ def check_file_size(file_path: str) -> bool:
         raise ValueError(f"File not found: {file_path}")
     file_size = os.path.getsize(file_path)
-    file_lower = file_path.lower()
-    if file_lower.endswith((".mp4", ".mov")):
         if file_size > MAX_VIDEO_SIZE:
             raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
-    elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
-        if file_size > MAX_AUDIO_SIZE:
-            raise ValueError(f"Audio file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_AUDIO_SIZE / (1024*1024):.0f}MB")
     else:
         if file_size > MAX_IMAGE_SIZE:
-            raise ValueError(f"Image/document file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
     return True
@@ -94,44 +87,6 @@ def process_video(video_path: str, max_images: int) -> list[dict]:
     return result_content
-def process_audio(audio_path: str) -> list[dict]:
-    """Process an audio file and return formatted content for the model."""
-    check_file_size(audio_path)
-    try:
-        # Load audio file
-        audio_data, sample_rate = librosa.load(audio_path, sr=None)
-        duration = len(audio_data) / sample_rate
-        # Get basic audio features
-        rms = librosa.feature.rms(y=audio_data)[0]
-        spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate)[0]
-        zero_crossings = librosa.zero_crossings(audio_data, pad=False)
-        # Calculate statistics
-        avg_rms = np.mean(rms)
-        avg_spectral_centroid = np.mean(spectral_centroids)
-        zcr_rate = np.sum(zero_crossings) / len(audio_data)
-        # Create audio analysis text
-        audio_analysis = f"""Audio Analysis:
-- Duration: {duration:.2f} seconds
-- Sample Rate: {sample_rate} Hz
-- Average RMS Energy: {avg_rms:.4f}
-- Average Spectral Centroid: {avg_spectral_centroid:.2f} Hz
-- Zero Crossing Rate: {zcr_rate:.4f}
-- File: {os.path.basename(audio_path)}"""
-        result_content = [{"type": "text", "text": audio_analysis}]
-        logger.debug(f"Processed audio file {audio_path} - Duration: {duration:.2f}s")
-        return result_content
-    except Exception as e:
-        logger.error(f"Error processing audio {audio_path}: {e}")
-        raise ValueError(f"Failed to process audio file: {str(e)}")
 def extract_pdf_text(pdf_path: str) -> str:
     """Extract text content from a PDF file."""
     check_file_size(pdf_path)
@@ -172,22 +127,14 @@ def process_user_input(message: dict, max_images: int) -> list[dict]:
             logger.error(f"File size check failed: {e}")
             result_content.append({"type": "text", "text": f"Error: {str(e)}"})
             continue
-        file_lower = file_path.lower()
-        if file_lower.endswith((".mp4", ".mov")):
             try:
                 result_content = [*result_content, *process_video(file_path, max_images)]
             except Exception as e:
                 logger.error(f"Video processing failed: {e}")
                 result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
-        elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
-            try:
-                result_content = [*result_content, *process_audio(file_path)]
-            except Exception as e:
-                logger.error(f"Audio processing failed: {e}")
-                result_content.append({"type": "text", "text": f"Error processing audio: {str(e)}"})
-        elif file_lower.endswith(".pdf"):
             try:
                 logger.info(f"Processing PDF file: {file_path}")
                 pdf_text = extract_pdf_text(file_path)
@@ -228,12 +175,9 @@ def process_history(history: list[dict]) -> list[dict]:
                 content_buffer.append({"type": "text", "text": content})
             elif isinstance(content, tuple) and len(content) > 0:
                 file_path = content[0]
-                file_lower = file_path.lower()
-                if file_lower.endswith((".mp4", ".mov")):
                     content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
-                elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
-                    content_buffer.append({"type": "text", "text": "[Audio uploaded previously]"})
-                elif file_lower.endswith(".pdf"):
                     content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
                 else:
                     content_buffer.append({"type": "image", "url": file_path})

 import cv2
 import fitz
 import tempfile
 from PIL import Image
 from loguru import logger
 # Constants
 MAX_VIDEO_SIZE = 100 * 1024 * 1024  # 100 MB
 MAX_IMAGE_SIZE = 10 * 1024 * 1024   # 10 MB
 PRESET_PROMPTS = {
     "General Assistant": "You are a helpful AI assistant capable of analyzing images, videos, and PDF documents. Provide clear, accurate, and helpful responses to user queries.",
         raise ValueError(f"File not found: {file_path}")
     file_size = os.path.getsize(file_path)
+    if file_path.lower().endswith((".mp4", ".mov")):
         if file_size > MAX_VIDEO_SIZE:
             raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
     else:
         if file_size > MAX_IMAGE_SIZE:
+            raise ValueError(f"Image file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
     return True
     return result_content
 def extract_pdf_text(pdf_path: str) -> str:
     """Extract text content from a PDF file."""
     check_file_size(pdf_path)
             logger.error(f"File size check failed: {e}")
             result_content.append({"type": "text", "text": f"Error: {str(e)}"})
             continue
+        if file_path.endswith((".mp4", ".mov")):
             try:
                 result_content = [*result_content, *process_video(file_path, max_images)]
             except Exception as e:
                 logger.error(f"Video processing failed: {e}")
                 result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
+        elif file_path.lower().endswith(".pdf"):
             try:
                 logger.info(f"Processing PDF file: {file_path}")
                 pdf_text = extract_pdf_text(file_path)
                 content_buffer.append({"type": "text", "text": content})
             elif isinstance(content, tuple) and len(content) > 0:
                 file_path = content[0]
+                if file_path.endswith((".mp4", ".mov")):
                     content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
+                elif file_path.lower().endswith(".pdf"):
                     content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
                 else:
                     content_buffer.append({"type": "image", "url": file_path})