Spaces:

AC2513
/

gemma-demo

Running on Zero

App Files Files Community

AC2513 committed on Aug 23

Commit

80d03f7

1 Parent(s): ceb2ea0

added audio processing

Browse files

Files changed (3) hide show

app.py +47 -6
requirements.txt +2 -1
utils.py +62 -6

app.py CHANGED Viewed

@@ -68,6 +68,16 @@ def run(
         f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
     )
     def try_fallback_model(original_model_choice: str):
         fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
         fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
@@ -235,13 +245,26 @@ def run(
             yield error_message
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
-    textbox=gr.MultimodalTextbox(
-        file_types=[".mp4", ".jpg", ".png", ".pdf"], file_count="multiple", autofocus=True
-    ),
     multimodal=True,
     additional_inputs=[
         gr.Dropdown(
@@ -268,7 +291,7 @@ demo = gr.ChatInterface(
             label="Model",
             choices=["Gemma 3 12B", "Gemma 3n E4B"],
             value="Gemma 3 12B",
-            info="Gemma 3 12B: More powerful and detailed responses, but slower processing. Gemma 3n E4B: Faster processing with efficient performance for most tasks."
         ),
         gr.Slider(
             label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
@@ -293,11 +316,29 @@ demo = gr.ChatInterface(
 # Connect the dropdown to update the textbox
 with demo:
     preset_dropdown = demo.additional_inputs[0]
-    custom_textbox = demo.additional_inputs[1]
     preset_dropdown.change(
         fn=update_custom_prompt,
         inputs=[preset_dropdown],
-        outputs=[custom_textbox]
     )
 if __name__ == "__main__":

         f"system_prompt: {system_prompt} \n model_choice: {model_choice} \n max_new_tokens: {max_new_tokens} \n max_images: {max_images}"
     )
+    # Validate audio files are only used with 3n model
+    if message.get("files"):
+        audio_extensions = [".wav", ".mp3", ".m4a", ".flac", ".ogg"]
+        has_audio = any(any(file.lower().endswith(ext) for ext in audio_extensions) for file in message["files"])
+        if has_audio and model_choice != "Gemma 3n E4B":
+            error_msg = "❌ **Audio files are only supported with the Gemma 3n E4B model.**\n\nPlease switch to the Gemma 3n E4B model to process audio files, or remove audio files to continue with the current model."
+            yield error_msg
+            return
     def try_fallback_model(original_model_choice: str):
         fallback_model = model_3n if original_model_choice == "Gemma 3 12B" else model_12
         fallback_name = "Gemma 3n E4B" if original_model_choice == "Gemma 3 12B" else "Gemma 3 12B"
             yield error_message
+def update_file_types(model_choice):
+    """Return the upload file extensions permitted for the selected model.
+
+    Every model accepts the video/image/PDF extensions; only the
+    "Gemma 3n E4B" model additionally accepts the audio extensions.
+    """
+    allowed = [".mp4", ".jpg", ".png", ".pdf"]
+    # Any model other than 3n gets the base set unchanged.
+    if model_choice != "Gemma 3n E4B":
+        return allowed
+    return [*allowed, ".wav", ".mp3", ".m4a", ".flac", ".ogg"]
+# Build the textbox outside ChatInterface so it can be passed in explicitly via
+# textbox=...; it starts with the non-audio file types only.
+custom_textbox = gr.MultimodalTextbox(
+    file_types=[".mp4", ".jpg", ".png", ".pdf"],
+    file_count="multiple",
+    autofocus=True
+)
 demo = gr.ChatInterface(
     fn=run,
     type="messages",
     chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
+    textbox=custom_textbox,
     multimodal=True,
     additional_inputs=[
         gr.Dropdown(
             label="Model",
             choices=["Gemma 3 12B", "Gemma 3n E4B"],
             value="Gemma 3 12B",
+            info="Gemma 3 12B: More powerful and detailed responses, supports images, videos, and PDFs. Gemma 3n E4B: Faster processing with efficient performance, supports images, videos, PDFs, and audio files."
         ),
         gr.Slider(
             label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700
 # Connect the dropdown to update the textbox
 with demo:
     preset_dropdown = demo.additional_inputs[0]
+    custom_textbox_input = demo.additional_inputs[1]
+    model_dropdown = demo.additional_inputs[2]
+    # Update custom prompt when preset changes
     preset_dropdown.change(
         fn=update_custom_prompt,
         inputs=[preset_dropdown],
+        outputs=[custom_textbox_input]
+    )
+    # Update file types when model changes
+    def update_textbox_file_types(model_choice):
+        """Return a MultimodalTextbox limited to the file types allowed for the model."""
+        return gr.MultimodalTextbox(
+            file_types=update_file_types(model_choice),
+            file_count="multiple",
+            autofocus=True,
+        )
+    model_dropdown.change(
+        fn=update_textbox_file_types,
+        inputs=[model_dropdown],
+        outputs=[demo.textbox]
     )
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -9,4 +9,5 @@ loguru
 python-dotenv
 opencv-python
 timm
-pymupdf

 python-dotenv
 opencv-python
 timm
+pymupdf
+librosa

utils.py CHANGED Viewed

@@ -2,12 +2,15 @@ import os
 import cv2
 import fitz
 import tempfile
 from PIL import Image
 from loguru import logger
 # Constants
 MAX_VIDEO_SIZE = 100 * 1024 * 1024  # 100 MB
 MAX_IMAGE_SIZE = 10 * 1024 * 1024   # 10 MB
 def check_file_size(file_path: str) -> bool:
@@ -16,13 +19,17 @@ def check_file_size(file_path: str) -> bool:
         raise ValueError(f"File not found: {file_path}")
     file_size = os.path.getsize(file_path)
-    if file_path.lower().endswith((".mp4", ".mov")):
         if file_size > MAX_VIDEO_SIZE:
             raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
     else:
         if file_size > MAX_IMAGE_SIZE:
-            raise ValueError(f"Image file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
     return True
@@ -74,6 +81,44 @@ def process_video(video_path: str, max_images: int) -> list[dict]:
     return result_content
 def extract_pdf_text(pdf_path: str) -> str:
     """Extract text content from a PDF file."""
     check_file_size(pdf_path)
@@ -114,14 +159,22 @@ def process_user_input(message: dict, max_images: int) -> list[dict]:
             logger.error(f"File size check failed: {e}")
             result_content.append({"type": "text", "text": f"Error: {str(e)}"})
             continue
-        if file_path.endswith((".mp4", ".mov")):
             try:
                 result_content = [*result_content, *process_video(file_path, max_images)]
             except Exception as e:
                 logger.error(f"Video processing failed: {e}")
                 result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
-        elif file_path.lower().endswith(".pdf"):
             try:
                 logger.info(f"Processing PDF file: {file_path}")
                 pdf_text = extract_pdf_text(file_path)
@@ -162,9 +215,12 @@ def process_history(history: list[dict]) -> list[dict]:
                 content_buffer.append({"type": "text", "text": content})
             elif isinstance(content, tuple) and len(content) > 0:
                 file_path = content[0]
-                if file_path.endswith((".mp4", ".mov")):
                     content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
-                elif file_path.lower().endswith(".pdf"):
                     content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
                 else:
                     content_buffer.append({"type": "image", "url": file_path})

 import cv2
 import fitz
 import tempfile
+import librosa
+import numpy as np
 from PIL import Image
 from loguru import logger
 # Constants
 MAX_VIDEO_SIZE = 100 * 1024 * 1024  # 100 MB
 MAX_IMAGE_SIZE = 10 * 1024 * 1024   # 10 MB
+MAX_AUDIO_SIZE = 50 * 1024 * 1024   # 50 MB
 def check_file_size(file_path: str) -> bool:
         raise ValueError(f"File not found: {file_path}")
     file_size = os.path.getsize(file_path)
+    file_lower = file_path.lower()
+    if file_lower.endswith((".mp4", ".mov")):
         if file_size > MAX_VIDEO_SIZE:
             raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
+    elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
+        if file_size > MAX_AUDIO_SIZE:
+            raise ValueError(f"Audio file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_AUDIO_SIZE / (1024*1024):.0f}MB")
     else:
         if file_size > MAX_IMAGE_SIZE:
+            raise ValueError(f"Image/document file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
     return True
     return result_content
+def process_audio(audio_path: str) -> list[dict]:
+    """Summarise an audio file as text content for the model.
+
+    The file is loaded with librosa and reduced to a handful of scalar
+    statistics (duration, mean RMS energy, mean spectral centroid and
+    zero-crossing rate). Only that textual summary is returned.
+
+    Args:
+        audio_path: Path to the audio file to analyse.
+
+    Returns:
+        A single-element list holding a ``{"type": "text", "text": ...}``
+        entry with the formatted analysis.
+
+    Raises:
+        ValueError: If the size check rejects the file, the file decodes to
+            zero samples, or loading/analysis fails for any other reason.
+    """
+    check_file_size(audio_path)
+    try:
+        # Load audio file (sr=None asks librosa not to resample)
+        audio_data, sample_rate = librosa.load(audio_path, sr=None)
+        sample_count = len(audio_data)
+        if sample_count == 0:
+            # The zero-crossing rate below divides by the sample count, so
+            # reject empty audio up front with a clear message.
+            raise ValueError("Audio file contains no samples")
+        duration = sample_count / sample_rate
+        # Get basic audio features
+        rms = librosa.feature.rms(y=audio_data)[0]
+        spectral_centroids = librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate)[0]
+        zero_crossings = librosa.zero_crossings(audio_data, pad=False)
+        # Calculate statistics
+        avg_rms = np.mean(rms)
+        avg_spectral_centroid = np.mean(spectral_centroids)
+        zcr_rate = np.sum(zero_crossings) / sample_count
+        # Create audio analysis text
+        audio_analysis = f"""Audio Analysis:
+- Duration: {duration:.2f} seconds
+- Sample Rate: {sample_rate} Hz
+- Average RMS Energy: {avg_rms:.4f}
+- Average Spectral Centroid: {avg_spectral_centroid:.2f} Hz
+- Zero Crossing Rate: {zcr_rate:.4f}
+- File: {os.path.basename(audio_path)}"""
+        result_content = [{"type": "text", "text": audio_analysis}]
+        logger.debug(f"Processed audio file {audio_path} - Duration: {duration:.2f}s")
+        return result_content
+    except Exception as e:
+        logger.error(f"Error processing audio {audio_path}: {e}")
+        # Chain the original exception so the root cause stays in the traceback.
+        raise ValueError(f"Failed to process audio file: {str(e)}") from e
 def extract_pdf_text(pdf_path: str) -> str:
     """Extract text content from a PDF file."""
     check_file_size(pdf_path)
             logger.error(f"File size check failed: {e}")
             result_content.append({"type": "text", "text": f"Error: {str(e)}"})
             continue
+        file_lower = file_path.lower()
+        if file_lower.endswith((".mp4", ".mov")):
             try:
                 result_content = [*result_content, *process_video(file_path, max_images)]
             except Exception as e:
                 logger.error(f"Video processing failed: {e}")
                 result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
+        elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
+            try:
+                result_content = [*result_content, *process_audio(file_path)]
+            except Exception as e:
+                logger.error(f"Audio processing failed: {e}")
+                result_content.append({"type": "text", "text": f"Error processing audio: {str(e)}"})
+        elif file_lower.endswith(".pdf"):
             try:
                 logger.info(f"Processing PDF file: {file_path}")
                 pdf_text = extract_pdf_text(file_path)
                 content_buffer.append({"type": "text", "text": content})
             elif isinstance(content, tuple) and len(content) > 0:
                 file_path = content[0]
+                file_lower = file_path.lower()
+                if file_lower.endswith((".mp4", ".mov")):
                     content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
+                elif file_lower.endswith((".wav", ".mp3", ".m4a", ".flac", ".ogg")):
+                    content_buffer.append({"type": "text", "text": "[Audio uploaded previously]"})
+                elif file_lower.endswith(".pdf"):
                     content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
                 else:
                     content_buffer.append({"type": "image", "url": file_path})