Spaces:

monitkorn
/

Speech_Analysis

Sleeping

App Files Community

monitkorn committed on Jun 21

Commit

353a46c

1 Parent(s): 07c3b9c

update model

Browse files

Files changed (1) hide show

app.py +10 -22

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ import tempfile
 import requests
 from moviepy.editor import VideoFileClip
-# Ensure the official OpenAI Whisper package is installed (supports load_model)
 try:
     import whisper
     if not hasattr(whisper, 'load_model'):
@@ -21,29 +20,26 @@ from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
 from huggingface_hub import login
 import gradio as gr
-# Authenticate with Hugging Face (token via HF_TOKEN env var)
-# Device setup (GPU if available)
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
 def load_models():
-    # Load Whisper directly on the target device
-    whisper_model = whisper.load_model('base', device=device)
     processor = Wav2Vec2Processor.from_pretrained(
-        'jonatasgrosman/wav2vec2-large-xlsr-53-english'
     )
     accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(
-        'jonatasgrosman/wav2vec2-large-xlsr-53-english'
     ).to(device)
     return whisper_model, processor, accent_model
 whisper_model, processor, accent_model = load_models()
-# Main analysis function
 def analyze(video_url: str):
-    # Download video to temp file
     with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_vid:
         response = requests.get(video_url, stream=True)
         response.raise_for_status()
@@ -52,23 +48,19 @@ def analyze(video_url: str):
                 tmp_vid.write(chunk)
         video_path = tmp_vid.name
-    # Extract audio
     audio_path = video_path.replace('.mp4', '.wav')
     clip = VideoFileClip(video_path)
     clip.audio.write_audiofile(audio_path, verbose=False, logger=None)
     clip.close()
-    # Load audio waveform
     speech, sr = librosa.load(audio_path, sr=16000)
-    # Transcribe with Whisper (model on correct device)
     result = whisper_model.transcribe(speech)
     transcript = result.get('text', '')
     lang = result.get('language', 'unknown')
     if lang != 'en':
         transcript = f"[Non-English detected: {lang}]\n" + transcript
-        # Accent classification
     inputs = processor(speech, sampling_rate=sr, return_tensors='pt', padding=True)
     input_values = inputs.input_values.to(device)
     attention_mask = inputs.attention_mask.to(device)
@@ -76,20 +68,17 @@ def analyze(video_url: str):
         logits = accent_model(input_values=input_values, attention_mask=attention_mask).logits
         probs = torch.softmax(logits, dim=-1).squeeze().cpu().tolist()
-    # Map default LABEL_x to human-readable accents
     accent_labels = [
         'American', 'Australian', 'British', 'Canadian', 'Indian',
         'Irish', 'New Zealander', 'South African', 'Welsh'
-    ]  # ensure this matches model output order
     accent_probs = [(accent_labels[i], probs[i] * 100) for i in range(len(probs))]
     accent_probs.sort(key=lambda x: x[1], reverse=True)
     top_accent, top_conf = accent_probs[0]
-    # Prepare DataFrame
     df = pd.DataFrame(accent_probs, columns=['Accent', 'Confidence (%)'])
     df = pd.DataFrame(accent_probs, columns=['Accent', 'Confidence (%)'])
-    # Cleanup temp files
     try:
         os.remove(video_path)
         os.remove(audio_path)
@@ -98,7 +87,6 @@ def analyze(video_url: str):
     return top_accent, f"{top_conf:.2f}%", df
-# Gradio interface
 interface = gr.Interface(
     fn=analyze,
     inputs=gr.Textbox(label='Video URL', placeholder='Enter public MP4 URL'),
@@ -109,7 +97,7 @@ interface = gr.Interface(
         gr.Dataframe(label='All Accent Probabilities')
     ],
     title='English Accent Detector',
-    description='Paste a Loom or direct MP4 URL to extract, transcribe, and classify English accents (uses GPU if available).',
     allow_flagging='never'
 )

 import requests
 from moviepy.editor import VideoFileClip
 try:
     import whisper
     if not hasattr(whisper, 'load_model'):
 from huggingface_hub import login
 import gradio as gr
+# device = 'cuda' if torch.cuda.is_available() else 'cpu'
+device = 'cpu'
 def load_models():
+    whisper_model = whisper.load_model('tiny', device=device)
     processor = Wav2Vec2Processor.from_pretrained(
+        'jonatasgrosman/wav2vec2-large-english'
     )
     accent_model = Wav2Vec2ForSequenceClassification.from_pretrained(
+        'jonatasgrosman/wav2vec2-large-english'
     ).to(device)
+    accent_model = torch.quantization.quantize_dynamic(
+          accent_model, {torch.nn.Linear}, dtype=torch.qint8
+    )
     return whisper_model, processor, accent_model
 whisper_model, processor, accent_model = load_models()
 def analyze(video_url: str):
     with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_vid:
         response = requests.get(video_url, stream=True)
         response.raise_for_status()
                 tmp_vid.write(chunk)
         video_path = tmp_vid.name
     audio_path = video_path.replace('.mp4', '.wav')
     clip = VideoFileClip(video_path)
     clip.audio.write_audiofile(audio_path, verbose=False, logger=None)
     clip.close()
     speech, sr = librosa.load(audio_path, sr=16000)
     result = whisper_model.transcribe(speech)
     transcript = result.get('text', '')
     lang = result.get('language', 'unknown')
     if lang != 'en':
         transcript = f"[Non-English detected: {lang}]\n" + transcript
     inputs = processor(speech, sampling_rate=sr, return_tensors='pt', padding=True)
     input_values = inputs.input_values.to(device)
     attention_mask = inputs.attention_mask.to(device)
         logits = accent_model(input_values=input_values, attention_mask=attention_mask).logits
         probs = torch.softmax(logits, dim=-1).squeeze().cpu().tolist()
     accent_labels = [
         'American', 'Australian', 'British', 'Canadian', 'Indian',
         'Irish', 'New Zealander', 'South African', 'Welsh'
+    ]
     accent_probs = [(accent_labels[i], probs[i] * 100) for i in range(len(probs))]
     accent_probs.sort(key=lambda x: x[1], reverse=True)
     top_accent, top_conf = accent_probs[0]
     df = pd.DataFrame(accent_probs, columns=['Accent', 'Confidence (%)'])
     df = pd.DataFrame(accent_probs, columns=['Accent', 'Confidence (%)'])
     try:
         os.remove(video_path)
         os.remove(audio_path)
     return top_accent, f"{top_conf:.2f}%", df
 interface = gr.Interface(
     fn=analyze,
     inputs=gr.Textbox(label='Video URL', placeholder='Enter public MP4 URL'),
         gr.Dataframe(label='All Accent Probabilities')
     ],
     title='English Accent Detector',
+    description='Paste a direct MP4 URL to extract, transcribe, and classify English accents. It is a bit slow since we run Whisper and Wav2Vec2 models on CPU. Please test with short videos.',
     allow_flagging='never'
 )