Azgadel committed on
Commit b251a32 · verified · 1 Parent(s): abe7eaf
Files changed (1)
  1. app.py +329 -360
app.py CHANGED
@@ -1,7 +1,6 @@
  import os
- os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
 
- import streamlit as st
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
@@ -9,9 +8,7 @@ import soundfile as sf
  import torchaudio
  from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
  import numpy as np
- from pathlib import Path
  import json
- import tempfile
 
  # ============================================================
  # MODEL DEFINITION
@@ -41,75 +38,22 @@ class Wav2Vec2ForSpeakerEmbedding(nn.Module):
 
 
  # ============================================================
- # AUDIO PROCESSING
  # ============================================================
 
- def process_audio(audio_file, feature_extractor, max_length=16000*3):
-     """Process uploaded audio file"""
-     try:
-         # Save uploaded file temporarily
-         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
-             tmp_file.write(audio_file.getvalue())
-             tmp_path = tmp_file.name
-
-         # Load audio
-         waveform, sr = sf.read(tmp_path, dtype='float32')
-         waveform = torch.from_numpy(waveform)
-
-         # Convert to mono
-         if len(waveform.shape) > 1:
-             waveform = torch.mean(waveform, dim=-1)
-
-         # Resample to 16kHz
-         if sr != 16000:
-             resampler = torchaudio.transforms.Resample(sr, 16000)
-             waveform = resampler(waveform)
-
-         # Take middle chunk
-         if len(waveform) > max_length:
-             start = (len(waveform) - max_length) // 2
-             waveform = waveform[start:start + max_length]
-         elif len(waveform) < max_length:
-             padding = max_length - len(waveform)
-             waveform = torch.nn.functional.pad(waveform, (0, padding))
-
-         # Normalize
-         if waveform.abs().max() > 0:
-             waveform = waveform / waveform.abs().max()
-
-         # Extract features
-         inputs = feature_extractor(
-             waveform.numpy(),
-             sampling_rate=16000,
-             return_tensors="pt"
-         )
-
-         # Cleanup
-         os.unlink(tmp_path)
-
-         return inputs.input_values, waveform.numpy(), sr
-
-     except Exception as e:
-         st.error(f"Error processing audio: {e}")
-         return None, None, None
 
- def get_embedding(model, audio_file, feature_extractor, device):
-     """Extract embedding from audio file"""
-     inputs, waveform, sr = process_audio(audio_file, feature_extractor)
-     if inputs is None:
-         return None
-
-     model.eval()
-     with torch.no_grad():
-         inputs = inputs.to(device)
-         embedding = model(inputs)
-
-     return embedding.cpu().numpy()
 
 
  # ============================================================
- # ENROLLMENT DATABASE
  # ============================================================
 
  class EnrollmentDB:
@@ -135,10 +79,6 @@ class EnrollmentDB:
          self.save_db()
 
      def verify(self, embedding, threshold=0.75):
-         """
-         Verify against all enrolled users
-         Returns: (best_match_name, similarity_score, is_verified)
-         """
          if not self.enrollments:
              return None, 0.0, False
 
@@ -156,12 +96,14 @@ class EnrollmentDB:
                  best_match = name
 
          is_verified = best_score >= threshold
-
          return best_match, best_score, is_verified
 
      def get_all_users(self):
          return list(self.enrollments.keys())
 
      def remove_user(self, name):
          if name in self.enrollments:
              del self.enrollments[name]
@@ -169,326 +111,353 @@ class EnrollmentDB:
              return True
          return False
 
 
  # ============================================================
- # STREAMLIT APP
  # ============================================================
 
- @st.cache_resource
- def load_model():
-     """Load model once and cache it"""
-     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
-     model = Wav2Vec2ForSpeakerEmbedding(embedding_size=256).to(device)
-     checkpoint = torch.load('best_embedding_model.pth', map_location=device)
-     model.load_state_dict(checkpoint['model_state_dict'])
      model.eval()
-
-     feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
-
-     return model, feature_extractor, device
 
 
- def main():
-     st.set_page_config(
-         page_title="Voice Biometry Demo",
-         page_icon="🎤",
-         layout="wide"
-     )
 
-     # Custom CSS
-     st.markdown("""
-     <style>
-     .big-font {
-         font-size:20px !important;
-         font-weight: bold;
-     }
-     .success-box {
-         padding: 20px;
-         border-radius: 10px;
-         background-color: #d4edda;
-         border: 2px solid #28a745;
-         color: #155724;
-     }
-     .failure-box {
-         padding: 20px;
-         border-radius: 10px;
-         background-color: #f8d7da;
-         border: 2px solid #dc3545;
-         color: #721c24;
-     }
-     .info-box {
-         padding: 20px;
-         border-radius: 10px;
-         background-color: #d1ecf1;
-         border: 2px solid #17a2b8;
-         color: #0c5460;
-     }
-     </style>
-     """, unsafe_allow_html=True)
 
-     # Header
-     st.title("Voice Biometry System - Proof of Concept")
-     st.markdown("### Finetuned Wav2Vec 2.0")
 
-     # Load model
-     with st.spinner("Loading model..."):
-         model, feature_extractor, device = load_model()
 
-     # Initialize database
-     db = EnrollmentDB()
 
-     # Sidebar - Configuration
-     st.sidebar.header("⚙️ Configuration")
-     threshold = st.sidebar.slider(
-         "Verification Threshold",
-         min_value=0.5,
-         max_value=0.95,
-         value=0.75,
-         step=0.05,
-         help="Higher = more strict verification"
-     )
 
-     st.sidebar.markdown("---")
-     st.sidebar.header("📊 System Stats")
-     st.sidebar.metric("Enrolled Users", len(db.get_all_users()))
-     st.sidebar.metric("Model Accuracy", "76%")
-     st.sidebar.metric("AUC Score", "0.82")
 
-     # Enrolled users list
-     if db.get_all_users():
-         st.sidebar.markdown("---")
-         st.sidebar.header("👥 Enrolled Users")
-         for user in db.get_all_users():
-             col1, col2 = st.sidebar.columns([3, 1])
-             col1.write(f"• {user}")
-             if col2.button("🗑️", key=f"del_{user}"):
-                 db.remove_user(user)
-                 st.rerun()
 
-     # Main tabs
-     tab1, tab2, tab3 = st.tabs(["📝 Enrollment", "✅ Verification", "ℹ️ About"])
 
-     # ============================================================
-     # TAB 1: ENROLLMENT
-     # ============================================================
-     with tab1:
-         st.header("Enroll a New User")
-         st.markdown("Upload a voice recording to register a new user in the system.")
-
-         col1, col2 = st.columns([2, 1])
-
-         with col1:
-             enroll_name = st.text_input(
-                 "User Name",
-                 placeholder="Enter name (e.g., Abdou Diop)",
-                 help="This name will be used to identify the speaker"
              )
 
-             enroll_audio = st.file_uploader(
-                 "Upload Voice Recording",
-                 type=['wav', 'mp3', 'flac', 'ogg'],
-                 help="Upload a clear voice recording (3-20 seconds recommended)",
-                 key="enroll"
              )
 
-         with col2:
-             st.info("""
-             **Enrollment Tips:**
-             - Use clear audio
-             - 3-20 seconds long
-             - Minimal background noise
-             - Normal speaking voice
-             """)
-
-         if st.button("🎯 Enroll User", type="primary", disabled=(not enroll_name or not enroll_audio)):
-             with st.spinner(f"Processing enrollment for {enroll_name}..."):
-                 # Check if user already exists
-                 if enroll_name in db.get_all_users():
-                     st.warning(f"⚠️ User '{enroll_name}' already exists. Please use a different name or remove the existing user first.")
-                 else:
-                     # Get embedding
-                     embedding = get_embedding(model, enroll_audio, feature_extractor, device)
-
-                     if embedding is not None:
-                         # Save enrollment
-                         db.enroll(enroll_name, embedding)
-
-                         st.markdown(f"""
-                         <div class="success-box">
-                         <h3>✅ Enrollment Successful!</h3>
-                         <p><strong>{enroll_name}</strong> has been enrolled in the system.</p>
-                         <p>Total enrolled users: {len(db.get_all_users())}</p>
-                         </div>
-                         """, unsafe_allow_html=True)
-
-                         #st.balloons()
-                     else:
-                         st.error("❌ Failed to process audio. Please try again with a different recording.")
-
-     # ============================================================
-     # TAB 2: VERIFICATION
-     # ============================================================
-     with tab2:
-         st.header("Verify User Identity")
-         st.markdown("Upload a voice recording to verify against enrolled users.")
-
-         if not db.get_all_users():
-             st.warning("⚠️ No users enrolled yet. Please enroll at least one user first.")
-         else:
-             col1, col2 = st.columns([2, 1])
 
-             with col1:
-                 verify_audio = st.file_uploader(
-                     "Upload Voice Recording for Verification",
-                     type=['wav', 'mp3', 'flac', 'ogg'],
-                     help="Upload a voice recording from a speaker you want to verify",
-                     key="verify"
-                 )
 
-             with col2:
-                 st.info(f"""
-                 **Verification Info:**
-                 - {len(db.get_all_users())} users enrolled
-                 - Threshold: {threshold:.2f}
-                 - Model: Wav2Vec 2.0
-                 """)
 
-             if st.button("🔍 Verify Identity", type="primary", disabled=(not verify_audio)):
-                 with st.spinner("Analyzing voice..."):
-                     # Get embedding
-                     embedding = get_embedding(model, verify_audio, feature_extractor, device)
-
-                     if embedding is not None:
-                         # Verify
-                         match_name, similarity, is_verified = db.verify(embedding, threshold)
-
-                         # Display results
-                         st.markdown("---")
-
-                         if is_verified:
-                             st.markdown(f"""
-                             <div class="success-box">
-                             <h2>✅ VERIFICATION SUCCESSFUL</h2>
-                             <h3>Identified as: {match_name}</h3>
-                             <p style="font-size: 18px;">Confidence Score: <strong>{similarity:.1%}</strong></p>
-                             </div>
-                             """, unsafe_allow_html=True)
-
-                             st.success(f"🎉 Welcome back, {match_name}!")
-
-                         else:
-                             st.markdown(f"""
-                             <div class="failure-box">
-                             <h2>❌ VERIFICATION FAILED</h2>
-                             <p>Closest match: <strong>{match_name}</strong></p>
-                             <p>Similarity: <strong>{similarity:.1%}</strong></p>
-                             <p>Threshold required: <strong>{threshold:.1%}</strong></p>
-                             <p><em>This speaker is not recognized in the system.</em></p>
-                             </div>
-                             """, unsafe_allow_html=True)
-
-                         # Show all scores
-                         with st.expander("📊 See detailed scores for all enrolled users"):
-                             st.markdown("### Similarity Scores")
-
-                             scores = []
-                             embedding_tensor = torch.from_numpy(embedding)
-
-                             for name, enrolled_emb in db.enrollments.items():
-                                 enrolled_tensor = torch.from_numpy(enrolled_emb)
-                                 sim = F.cosine_similarity(embedding_tensor, enrolled_tensor, dim=1).item()
-                                 scores.append({
-                                     'User': name,
-                                     'Similarity': f"{sim:.1%}",
-                                     'Status': '✅ Match' if sim >= threshold else '❌ No match'
-                                 })
-
-                             # Sort by similarity
-                             scores.sort(key=lambda x: x['Similarity'], reverse=True)
-
-                             import pandas as pd
-                             df = pd.DataFrame(scores)
-                             st.dataframe(df, use_container_width=True, hide_index=True)
-
-                     else:
-                         st.error("❌ Failed to process audio. Please try again with a different recording.")
-
-     # ============================================================
-     # TAB 3: ABOUT
-     # ============================================================
-     with tab3:
-         st.header("About This System")
 
-         col1, col2 = st.columns(2)
 
-         with col1:
-             st.markdown("""
-             ### 🎯 Technology
 
-             **Model Architecture:**
              - Base: Wav2Vec 2.0 (Facebook AI)
-             - Finetuned on 247 speakers
-             - 1035 voice samples (telephone quality, 8kHz)
-             - Embedding dimension: 256
 
-             **Training Details:**
              - Loss: Supervised Contrastive Learning
              - Framework: PyTorch + Transformers
-             - Training time: ~50 epochs
-             - Hardware: NVIDIA RTX 3050
-             """)
-
-         with col2:
-             st.markdown("""
-             ### 📊 Performance Metrics
 
-             **Evaluation Results:**
-             - **Accuracy:** 76%
-             - **AUC Score:** 0.82
-             - **True Positive Rate:** 79%
-             - **False Positive Rate:** 27%
 
-             **Test Set:**
-             - 1000 verification pairs
-             - 500 same-speaker pairs
-             - 500 different-speaker pairs
              """)
-
-         st.markdown("---")
-
-         st.markdown("""
-         ### 🔧 How It Works
-
-         1. **Enrollment Phase:**
-            - User uploads voice recording
-            - System extracts 256-dimensional embedding
-            - Embedding stored in database with user name
-
-         2. **Verification Phase:**
-            - Unknown voice recording uploaded
-            - System extracts embedding
-            - Computes cosine similarity with all enrolled users
-            - Returns match if similarity exceeds threshold
-
-         3. **Matching Algorithm:**
-            - Cosine similarity between embeddings
-            - Range: -1 (opposite) to +1 (identical)
-            - Typical same-speaker: 0.75-0.95
-            - Typical different-speaker: 0.30-0.70
-         """)
-
-         st.markdown("---")
-
-         st.info("""
-         **Note:** This is a proof of concept system. For production deployment, consider:
-         - Larger training dataset (10-20 samples per speaker)
-         - Better base model (WavLM for noisy conditions)
-         - Anti-spoofing measures
-         - Liveness detection
-         - Multi-enrollment (average multiple recordings per user)
-         """)
-
-
- if __name__ == "__main__":
-     main()
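Both the removed Streamlit version above and the Gradio version below rely on the same preprocessing contract before feature extraction: downmix to mono, resample to 16 kHz, center-crop (or zero-pad) to exactly 3 seconds, and peak-normalize. A minimal standalone sketch of that contract, assuming a soundfile-style (frames, channels) layout; the function name and synthetic input are illustrative, not part of the commit:

```python
# Hypothetical standalone sketch of the preprocessing steps in process_audio.
import torch
import torchaudio

def preprocess(waveform: torch.Tensor, sr: int, max_length: int = 16000 * 3) -> torch.Tensor:
    # Downmix: average channels (soundfile returns frames x channels)
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=-1)
    # Wav2Vec 2.0 expects 16 kHz input
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    # Keep the middle 3-second chunk, or zero-pad short clips on the right
    if len(waveform) > max_length:
        start = (len(waveform) - max_length) // 2
        waveform = waveform[start:start + max_length]
    elif len(waveform) < max_length:
        waveform = torch.nn.functional.pad(waveform, (0, max_length - len(waveform)))
    # Peak-normalize, guarding the all-zero case
    if waveform.abs().max() > 0:
        waveform = waveform / waveform.abs().max()
    return waveform

# 5 s of noise at 8 kHz comes out as exactly 48000 samples (3 s at 16 kHz)
assert preprocess(torch.randn(40000), sr=8000).shape == (48000,)
```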
 
  import os
 
+ import gradio as gr
  import torch
  import torch.nn as nn
  import torch.nn.functional as F
  import soundfile as sf
  import torchaudio
  from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
  import numpy as np
  import json
 
  # ============================================================
  # MODEL DEFINITION
 
 
  # ============================================================
+ # GLOBAL SETUP
  # ============================================================
 
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
+ # Load model
+ model = Wav2Vec2ForSpeakerEmbedding(embedding_size=256).to(device)
+ checkpoint = torch.load('best_embedding_model.pth', map_location=device)
+ model.load_state_dict(checkpoint['model_state_dict'])
+ model.eval()
 
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
 
 
  # ============================================================
+ # DATABASE
  # ============================================================
 
  class EnrollmentDB:
 
          self.save_db()
 
      def verify(self, embedding, threshold=0.75):
          if not self.enrollments:
              return None, 0.0, False
 
                  best_match = name
 
          is_verified = best_score >= threshold
          return best_match, best_score, is_verified
 
      def get_all_users(self):
          return list(self.enrollments.keys())
 
+     def get_user_count(self):
+         return len(self.enrollments)
+
      def remove_user(self, name):
          if name in self.enrollments:
              del self.enrollments[name]
              return True
          return False
 
+ db = EnrollmentDB()
+
 
  # ============================================================
+ # AUDIO PROCESSING
  # ============================================================
 
+ def process_audio(audio_path, max_length=16000*3):
+     """Process audio file"""
+     try:
+         waveform, sr = sf.read(audio_path, dtype='float32')
+         waveform = torch.from_numpy(waveform)
+
+         if len(waveform.shape) > 1:
+             waveform = torch.mean(waveform, dim=-1)
+
+         if sr != 16000:
+             resampler = torchaudio.transforms.Resample(sr, 16000)
+             waveform = resampler(waveform)
+
+         if len(waveform) > max_length:
+             start = (len(waveform) - max_length) // 2
+             waveform = waveform[start:start + max_length]
+         elif len(waveform) < max_length:
+             padding = max_length - len(waveform)
+             waveform = torch.nn.functional.pad(waveform, (0, padding))
+
+         if waveform.abs().max() > 0:
+             waveform = waveform / waveform.abs().max()
+
+         inputs = feature_extractor(
+             waveform.numpy(),
+             sampling_rate=16000,
+             return_tensors="pt"
+         )
+
+         return inputs.input_values
 
+     except Exception as e:
+         raise ValueError(f"Error processing audio: {e}")
+
+
+ def get_embedding(audio_path):
+     """Extract embedding from audio"""
      model.eval()
+     with torch.no_grad():
+         inputs = process_audio(audio_path)
+         inputs = inputs.to(device)
+         embedding = model(inputs)
+     return embedding.cpu().numpy()
+
 
+ # ============================================================
+ # GRADIO FUNCTIONS
+ # ============================================================
 
+ def enroll_user(name, audio, threshold):
+     """Enroll a new user"""
+     if not name or not name.strip():
+         return "❌ Veuillez entrer un nom.", get_user_list(), get_stats()
 
+     if not audio:
+         return "❌ Veuillez uploader un enregistrement audio.", get_user_list(), get_stats()
 
+     name = name.strip()
 
+     if name in db.get_all_users():
+         return f"⚠️ L'utilisateur '{name}' existe déjà.", get_user_list(), get_stats()
 
+     try:
+         embedding = get_embedding(audio)
+         db.enroll(name, embedding)
+         return f"✅ Enregistrement réussi!\n\n👤 {name} a été enregistré dans le système.\n📊 Total utilisateurs: {db.get_user_count()}", get_user_list(), get_stats()
+     except Exception as e:
+         return f"❌ Erreur: {str(e)}", get_user_list(), get_stats()
+
+
+ def verify_user(audio, threshold):
+     """Verify a user"""
+     if not audio:
+         return "❌ Veuillez uploader un enregistrement audio.", ""
 
+     if db.get_user_count() == 0:
+         return "⚠️ Aucun utilisateur enregistré. Veuillez d'abord enregistrer des utilisateurs.", ""
 
+     try:
+         embedding = get_embedding(audio)
+         match_name, similarity, is_verified = db.verify(embedding, threshold)
+
+         # Build detailed results
+         details = "📊 **Scores détaillés:**\n\n"
+         embedding_tensor = torch.from_numpy(embedding)
+
+         scores = []
+         for name, enrolled_emb in db.enrollments.items():
+             enrolled_tensor = torch.from_numpy(enrolled_emb)
+             sim = F.cosine_similarity(embedding_tensor, enrolled_tensor, dim=1).item()
+             status = "✅" if sim >= threshold else "❌"
+             scores.append((name, sim, status))
+
+         scores.sort(key=lambda x: x[1], reverse=True)
+
+         for name, sim, status in scores:
+             details += f"{status} **{name}**: {sim:.1%}\n"
+
+         if is_verified:
+             result = f"""
+ # ✅ VÉRIFICATION RÉUSSIE
+
+ ## Identifié comme: **{match_name}**
+ ### Score de confiance: **{similarity:.1%}**
+
+ ---
+ """
+             return result + details, details
+         else:
+             result = f"""
+ # ❌ VÉRIFICATION ÉCHOUÉE
+
+ Meilleure correspondance: **{match_name}**
+ Similarité: **{similarity:.1%}**
+ Seuil requis: **{threshold:.1%}**
+
+ *Cette voix n'est pas reconnue dans le système.*
+
+ ---
+ """
+             return result + details, details
+
+     except Exception as e:
+         return f"❌ Erreur: {str(e)}", ""
+
+
+ def get_user_list():
+     """Get list of enrolled users"""
+     users = db.get_all_users()
+     if not users:
+         return "Aucun utilisateur enregistré"
+     return "\n".join([f"• {user}" for user in sorted(users)])
+
+
+ def get_stats():
+     """Get system statistics"""
+     return f"""
+ **📊 Statistiques du système:**
+ - Utilisateurs enregistrés: {db.get_user_count()}
+ - Précision du modèle: 76%
+ - Score AUC: 0.82
+ - Architecture: Wav2Vec 2.0
+ """
+
+
+ def delete_user(name):
+     """Delete a user"""
+     if not name or not name.strip():
+         return "❌ Veuillez sélectionner un utilisateur.", get_user_list(), get_stats()
 
+     if db.remove_user(name.strip()):
+         return f"✅ Utilisateur '{name}' supprimé.", get_user_list(), get_stats()
+     else:
+         return f"❌ Utilisateur '{name}' non trouvé.", get_user_list(), get_stats()
+
+
+ # ============================================================
+ # GRADIO INTERFACE
+ # ============================================================
+
+ with gr.Blocks(title="Biométrie Vocale - POC", theme=gr.themes.Soft()) as demo:
 
+     gr.Markdown("""
+     # 🎤 Système de Biométrie Vocale
+     ### Proof of Concept - Wav2Vec 2.0 Fine-tuné
+     """)
 
+     with gr.Row():
+         with gr.Column(scale=2):
+             stats_display = gr.Markdown(get_stats())
+         with gr.Column(scale=1):
+             threshold = gr.Slider(
+                 minimum=0.5,
+                 maximum=0.95,
+                 value=0.75,
+                 step=0.05,
+                 label="Seuil de vérification",
+                 info="Plus élevé = vérification plus stricte"
              )
+
+     with gr.Tabs():
+         # TAB 1: ENROLLMENT
+         with gr.Tab("📝 Enregistrement"):
+             gr.Markdown("### Enregistrer un nouvel utilisateur")
+
+             with gr.Row():
+                 with gr.Column():
+                     enroll_name_input = gr.Textbox(
+                         label="Nom de l'utilisateur",
+                         placeholder="Ex: Jean Dupont"
+                     )
+                     enroll_audio_input = gr.Audio(
+                         label="Enregistrement vocal",
+                         type="filepath",
+                         sources=["upload", "microphone"]
+                     )
+                     enroll_button = gr.Button("🎯 Enregistrer", variant="primary")
+
+                 with gr.Column():
+                     gr.Markdown("""
+                     **💡 Conseils:**
+                     - Audio clair et net
+                     - 3-20 secondes recommandées
+                     - Bruit de fond minimal
+                     - Voix normale
+                     """)
+                     enrolled_users = gr.Textbox(
+                         label="Utilisateurs enregistrés",
+                         value=get_user_list(),
+                         lines=8,
+                         interactive=False
+                     )
 
+             enroll_output = gr.Markdown()
+
+             enroll_button.click(
+                 fn=enroll_user,
+                 inputs=[enroll_name_input, enroll_audio_input, threshold],
+                 outputs=[enroll_output, enrolled_users, stats_display]
              )
 
+         # TAB 2: VERIFICATION
+         with gr.Tab("✅ Vérification"):
+             gr.Markdown("### Vérifier l'identité d'un utilisateur")
 
+             with gr.Row():
+                 with gr.Column():
+                     verify_audio_input = gr.Audio(
+                         label="Enregistrement vocal à vérifier",
+                         type="filepath",
+                         sources=["upload", "microphone"]
+                     )
+                     verify_button = gr.Button("🔍 Vérifier", variant="primary")
+
+                 with gr.Column():
+                     gr.Markdown(f"""
+                     **ℹ️ Information:**
+                     - {db.get_user_count()} utilisateur(s) enregistré(s)
+                     - Seuil: ajustable dans le slider ci-dessus
+                     - Modèle: Wav2Vec 2.0
+                     """)
 
+             verify_output = gr.Markdown()
+             verify_details = gr.Markdown()
 
+             verify_button.click(
+                 fn=verify_user,
+                 inputs=[verify_audio_input, threshold],
+                 outputs=[verify_output, verify_details]
+             )
 
+         # TAB 3: MANAGEMENT
+         with gr.Tab("⚙️ Gestion"):
+             gr.Markdown("### Gérer les utilisateurs enregistrés")
+
+             with gr.Row():
+                 with gr.Column():
+                     delete_name_input = gr.Textbox(
+                         label="Nom de l'utilisateur à supprimer",
+                         placeholder="Ex: Jean Dupont"
+                     )
+                     delete_button = gr.Button("🗑️ Supprimer", variant="stop")
+
+                 with gr.Column():
+                     delete_users_list = gr.Textbox(
+                         label="Utilisateurs enregistrés",
+                         value=get_user_list(),
+                         lines=8,
+                         interactive=False
+                     )
+
+             delete_output = gr.Markdown()
+
+             delete_button.click(
+                 fn=delete_user,
+                 inputs=[delete_name_input],
+                 outputs=[delete_output, delete_users_list, stats_display]
+             )
 
+         # TAB 4: ABOUT
+         with gr.Tab("ℹ️ À propos"):
+             gr.Markdown("""
+             ## 🎯 Technologie
 
+             **Architecture du modèle:**
              - Base: Wav2Vec 2.0 (Facebook AI)
+             - Fine-tuné sur 247 locuteurs
+             - 1035 échantillons vocaux (qualité téléphonique, 8kHz)
+             - Dimension d'embedding: 256
 
+             **Détails d'entraînement:**
              - Loss: Supervised Contrastive Learning
              - Framework: PyTorch + Transformers
+             - Durée d'entraînement: ~50 epochs
+             - Matériel: NVIDIA RTX 3050
+
+             ---
+
+             ## 📊 Métriques de Performance
+
+             **Résultats d'évaluation:**
+             - **Précision:** 76%
+             - **Score AUC:** 0.82
+             - **Taux de vrais positifs:** 79%
+             - **Taux de faux positifs:** 27%
+
+             **Ensemble de test:**
+             - 1000 paires de vérification
+             - 500 paires même locuteur
+             - 500 paires locuteurs différents
+
+             ---
+
+             ## 🔧 Fonctionnement
+
+             1. **Phase d'enregistrement:**
+                - L'utilisateur uploade un enregistrement vocal
+                - Le système extrait un embedding de dimension 256
+                - L'embedding est stocké dans la base de données
+
+             2. **Phase de vérification:**
+                - Enregistrement vocal inconnu uploadé
+                - Le système extrait l'embedding
+                - Calcul de similarité cosinus avec tous les utilisateurs enregistrés
+                - Correspondance si similarité > seuil
 
+             3. **Algorithme de correspondance:**
+                - Similarité cosinus entre embeddings
+                - Plage: -1 (opposé) à +1 (identique)
+                - Même locuteur typique: 0.75-0.95
+                - Locuteurs différents typique: 0.30-0.70
 
+             ---
+
+             **Note:** Ceci est un système proof of concept. Pour un déploiement en production, considérer:
+             - Dataset plus large (10-20 échantillons par locuteur)
+             - Meilleur modèle de base (WavLM pour conditions bruitées)
+             - Mesures anti-spoofing
+             - Détection de vivacité
+             - Multi-enregistrement (moyenne de plusieurs enregistrements par utilisateur)
              """)
+
+ demo.launch(share=False)
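The matching rule described in the À propos tab is a nearest-neighbor search over enrolled embeddings under cosine similarity, accepted only above the slider threshold. A toy sketch of that rule, with standalone names and random (1, 256) embeddings for illustration; this is not the committed EnrollmentDB.verify:

```python
# Toy illustration of threshold-based cosine-similarity verification.
import torch
import torch.nn.functional as F

def verify(probe, enrollments, threshold=0.75):
    """Return (best_match_name, best_score, is_verified) for a (1, 256) probe."""
    best_name, best_score = None, -1.0
    for name, enrolled in enrollments.items():
        score = F.cosine_similarity(probe, enrolled, dim=1).item()  # in [-1, 1]
        if score > best_score:
            best_name, best_score = name, score
    return best_name, best_score, best_score >= threshold

enrolled = {"alice": torch.randn(1, 256), "bob": torch.randn(1, 256)}
probe = enrolled["alice"] + 0.05 * torch.randn(1, 256)  # near-duplicate of alice
print(verify(probe, enrolled))  # expected: ('alice', ~0.99, True)
```

Raising the threshold trades false accepts for false rejects, which is why the demo exposes it as a slider rather than hard-coding it.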