AIvry committed
Commit 5b6a83c · verified · 1 Parent(s): c9d5e40

Upload app.py

Files changed (1)
  1. app.py +327 -0
app.py ADDED
import gradio as gr
import zipfile
import shutil
from pathlib import Path
import traceback
import gc
import torch
import spaces

# Import your modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory

@spaces.GPU(duration=300)
def process_audio_files(zip_file, model_name, layer, alpha):
    """Process an uploaded ZIP file containing audio mixtures."""

    if zip_file is None:
        return None, "Please upload a ZIP file"

    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract the ZIP (zip_file is already a filepath string, since the
        # gr.File input below uses type="filepath")
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        # Find references and outputs directories
        refs_dir = None
        outs_dir = None

        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ['references', 'refs', 'reference']:
                    refs_dir = item
                elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                    outs_dir = item

        # Check one level deeper if not found
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ['references', 'refs', 'reference']:
                                refs_dir = subitem
                            elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                                outs_dir = subitem

        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file"

        # Get audio files, sorted so references and outputs pair up by order
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))

        if len(ref_files) == 0:
            return None, "No reference WAV files found"
        if len(out_files) == 0:
            return None, "No output WAV files found"
        if len(ref_files) != len(out_files):
            return None, (f"Number of reference files ({len(ref_files)}) must match "
                          f"number of output files ({len(out_files)}). "
                          "Files must be in the same order.")

        # Create a single-mixture manifest mapping the references to the
        # outputs of one system
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]

        # Validate model
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}"

        # Set layer
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)

        # Check GPU availability - use all available GPUs on the Space
        max_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0

        # Run experiment
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,
            add_ci=False  # Disable CI for faster processing in demo
        )

        # Create output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")

        with zipfile.ZipFile(output_zip, 'w') as zipf:
            results_path = Path(results_dir)
            files_added = 0

            # Add all files from results
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), (f"Processing completed! Created ZIP with {files_added} files. "
                                     "Note: Output files must be in the same order as reference files.")
        else:
            return None, "Processing completed but no output files were generated. Check if embeddings were computed."

    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg

    finally:
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()

def create_interface():
    with gr.Blocks(title="MAPSS - Manifold-based Assessment of Perceptual Source Separation") as demo:
        gr.Markdown("""
        # MAPSS: Manifold-based Assessment of Perceptual Source Separation

        Granular evaluation of speech and music source separation with the MAPSS measures:
        - **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
        - **Perceptual Separation (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.

        ## ⚠️ IMPORTANT: File Order Requirements

        **Output files MUST be in the same order as reference files!**
        - If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
        - Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
        - Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.

        ## Input Format

        Upload a ZIP file containing:
        ```
        your_mixture.zip
        ├── references/          # Original clean sources
        │   ├── speaker1.wav
        │   ├── speaker2.wav
        │   └── ...
        └── outputs/             # Separated outputs (SAME ORDER as references)
            ├── separated1.wav   # Must correspond to speaker1.wav
            ├── separated2.wav   # Must correspond to speaker2.wav
            └── ...
        ```

        ### Audio Requirements
        - Format: .wav files
        - Sample rate: Any (automatically resampled to 16 kHz)
        - Channels: Mono or stereo (converted to mono)
        - **Number of files: Equal number of references and outputs**
        - **Order: Output files must be in the same order as reference files** (see the packaging sketch below)

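        For example, a valid ZIP can be packaged from Python with only the standard library. This is a minimal sketch; the file names are placeholders for your own files, and `outs[i]` must be the separated estimate of `refs[i]`:

        ```python
        import zipfile
        from pathlib import Path

        def pack_mixture(refs, outs, zip_path="your_mixture.zip"):
            # refs and outs are equal-length, order-aligned lists of WAV paths
            with zipfile.ZipFile(zip_path, "w") as zf:
                for f in refs:
                    zf.write(f, f"references/{Path(f).name}")
                for f in outs:
                    zf.write(f, f"outputs/{Path(f).name}")

        # Hypothetical usage:
        # pack_mixture(["speaker1.wav", "speaker2.wav"],
        #              ["separated1.wav", "separated2.wav"])
        ```
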
        ## Output Format

        The tool generates a ZIP file containing:
        - `ps_scores_{model}.csv`: PS scores for each source over time
        - `pm_scores_{model}.csv`: PM scores for each source over time
        - `params.json`: Parameters used
        - `manifest_canonical.json`: File mapping and processing details

        ### Score Interpretation
        - **NaN values**: Appear in frames where fewer than 2 speakers are active
        - **Valid scores**: Only computed when at least 2 speakers are active in a frame
        - **Time resolution**: 20 ms frames (configurable in code; see the reading sketch below)

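        As a starting point for analysis, the per-frame scores can be summarized like this. A hedged sketch, assuming the CSV holds one row per frame and one numeric column per source (check the actual header of your results); the file name is hypothetical:

        ```python
        import pandas as pd

        ps = pd.read_csv("ps_scores_wav2vec2_base.csv")

        # Mean PS per source across frames where the score is defined;
        # pandas skips NaN values by default
        print(ps.mean(numeric_only=True))
        ```
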
        ## Available Models

        | Model | Description | Default Layer | Use Case |
        |-------|-------------|---------------|----------|
        | `raw` | Raw waveform features | N/A | Baseline comparison |
        | `wavlm` | WavLM Large | 24 | Strong performance |
        | `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
        | `hubert` | HuBERT Large | 24 | Good for speech |
        | `wavlm_base` | WavLM Base | 12 | Faster processing |
        | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
        | `hubert_base` | HuBERT Base | 12 | Faster processing |
        | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |

        ## Parameters

        - **Model**: Select the embedding model for feature extraction
        - **Layer**: Which transformer layer to use (auto-selected by default)
        - **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0); see the sketch below
          - 0.0 = No normalization
          - 1.0 = Full normalization (recommended)

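        Alpha plays the usual role of the density-normalization exponent in diffusion maps: the affinity kernel is divided by the node degrees raised to the power alpha. A minimal NumPy sketch of that standard construction (not necessarily the exact implementation used by the engine):

        ```python
        import numpy as np

        def alpha_normalize(K, alpha=1.0):
            # Standard diffusion-maps normalization: K_ij / (d_i**alpha * d_j**alpha)
            d = K.sum(axis=1)
            return K / np.outer(d ** alpha, d ** alpha)
        ```

        With alpha = 1.0 the influence of sampling density is factored out, which is why full normalization is the recommended default.
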
        ## Processing Notes

        - The system automatically detects which speakers are active in each frame (illustrated below)
        - PS/PM scores are only computed between active speakers
        - Processing time scales with number of sources and audio length
        - GPU acceleration is automatically used when available

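        To make the gating concrete, here is a hedged, simplified stand-in for the activity detection (an energy threshold per 20 ms frame; the engine's actual detector may differ):

        ```python
        import numpy as np

        def active_mask(sources, sr=16000, frame_ms=20, thresh_db=-40.0):
            # sources: list of 1-D waveforms; returns (n_sources, n_frames) bools
            hop = int(sr * frame_ms / 1000)
            n_frames = min(len(s) for s in sources) // hop
            mask = np.empty((len(sources), n_frames), dtype=bool)
            for i, s in enumerate(sources):
                frames = np.asarray(s[:n_frames * hop]).reshape(n_frames, hop)
                rms = np.sqrt((frames ** 2).mean(axis=1)) + 1e-12
                mask[i] = 20 * np.log10(rms) > thresh_db
            return mask

        # Frames where fewer than 2 sources are active receive NaN scores:
        # valid = active_mask(refs).sum(axis=0) >= 2
        ```
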
        ## Citation

        If you use MAPSS, please cite:

        ```bibtex
        @article{Ivry2025MAPSS,
          title   = {MAPSS: Manifold-based Assessment of Perceptual Source Separation},
          author  = {Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
          journal = {arXiv preprint arXiv:2509.09212},
          year    = {2025},
          url     = {https://arxiv.org/abs/2509.09212}
        }
        ```

        ## License

        Code: MIT License
        Paper: CC-BY-4.0

        ## Support

        For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
        """)

        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )

                model_dropdown = gr.Dropdown(
                    choices=["raw", "wavlm", "wav2vec2", "hubert",
                             "wavlm_base", "wav2vec2_base", "hubert_base",
                             "wav2vec2_xlsr"],
                    value="wav2vec2_base",
                    label="Select embedding model"
                )

                layer_slider = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )

                alpha_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )

                def update_layer_slider(model_name):
                    """Update the layer slider based on the selected model."""
                    model_configs = {
                        "raw": {"maximum": 0, "value": 0, "interactive": False},
                        "wavlm": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
                        "hubert": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
                        "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
                        "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
                        "hubert_base": {"maximum": 12, "value": 12, "interactive": True}
                    }

                    config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
                    return gr.Slider(
                        minimum=0,
                        maximum=config["maximum"],
                        value=config["value"],
                        step=1,
                        label=f"Layer (max: {config['maximum']}, default: {config['value']})" if config["interactive"] else "Layer (not applicable for raw features)",
                        interactive=config["interactive"]
                    )

                model_dropdown.change(
                    fn=update_layer_slider,
                    inputs=[model_dropdown],
                    outputs=[layer_slider]
                )

                process_btn = gr.Button("Process Audio Files", variant="primary")

            with gr.Column():
                output_file = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )

        process_btn.click(
            fn=process_audio_files,
            inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
            outputs=[output_file, status_text]
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()