Spaces:

AIvry
/

MAPSS-measures

Sleeping

App Files Files Community

AIvry committed on Sep 15

Commit

5b6a83c

verified ·

1 Parent(s): c9d5e40

Upload app.py

Browse files

Files changed (1) hide show

app.py +327 -0

app.py ADDED Viewed

	@@ -0,0 +1,327 @@

+import gradio as gr
+import zipfile
+import shutil
+from pathlib import Path
+import json
+import os
+import traceback
+import gc
+import torch
+import spaces
+# Import your modules
+from engine import compute_mapss_measures
+from models import get_model_config, cleanup_all_models
+from config import DEFAULT_ALPHA
+from utils import clear_gpu_memory
# Folder names (compared case-insensitively) accepted for each role.
_REFERENCE_DIR_NAMES = frozenset({"references", "refs", "reference"})
_OUTPUT_DIR_NAMES = frozenset({"outputs", "outs", "output", "separated"})

# Layer used for a model when the caller does not supply one.
_DEFAULT_LAYERS = {
    "wavlm": 24, "wav2vec2": 24, "hubert": 24,
    "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
    "wav2vec2_xlsr": 24,
}


def _is_archive_clutter(path):
    """Return True for hidden entries and the ``__MACOSX`` folder macOS adds to ZIPs."""
    return path.name.startswith(".") or path.name == "__MACOSX"


def _child_dirs(parent):
    """Return the non-clutter sub-directories of *parent* in sorted (stable) order."""
    return sorted(
        entry for entry in parent.iterdir()
        if entry.is_dir() and not _is_archive_clutter(entry)
    )


def _match_audio_dirs(candidates, refs_dir, outs_dir):
    """Fill whichever of *refs_dir* / *outs_dir* is still ``None`` from *candidates*."""
    for candidate in candidates:
        name = candidate.name.lower()
        if refs_dir is None and name in _REFERENCE_DIR_NAMES:
            refs_dir = candidate
        elif outs_dir is None and name in _OUTPUT_DIR_NAMES:
            outs_dir = candidate
    return refs_dir, outs_dir


def _locate_audio_dirs(root):
    """Find the references and outputs folders at *root* or one level below it.

    Top-level folders take precedence; the nested level is only consulted for
    a role that is still missing. Either element of the returned
    ``(refs_dir, outs_dir)`` pair is ``None`` when no matching folder exists.
    """
    top_level = _child_dirs(root)
    refs_dir, outs_dir = _match_audio_dirs(top_level, None, None)
    if refs_dir is None or outs_dir is None:
        nested = [sub for parent in top_level for sub in _child_dirs(parent)]
        refs_dir, outs_dir = _match_audio_dirs(nested, refs_dir, outs_dir)
    return refs_dir, outs_dir


def _list_wavs(directory):
    """Return the sorted WAV files directly inside *directory* (any extension case)."""
    return sorted(
        entry for entry in directory.iterdir()
        if entry.is_file()
        and entry.suffix.lower() == ".wav"
        and not _is_archive_clutter(entry)
    )


@spaces.GPU(duration=300)
def process_audio_files(zip_file, model_name, layer, alpha):
    """Process uploaded ZIP file containing audio mixtures.

    The archive is extracted, its references/outputs folders are located, a
    single-mixture manifest is built and handed to ``compute_mapss_measures``,
    and every file under the returned results directory is packed into a ZIP.

    Args:
        zip_file: Path of the uploaded archive (``str`` / ``os.PathLike``), or
            an object exposing the path as ``.name``. ``None`` means nothing
            was uploaded.
        model_name: Embedding model key; must be one of the keys returned by
            ``get_model_config(0)``.
        layer: Layer to use. Ignored for ``"raw"``; when ``None`` the
            per-model default is used.
        alpha: Forwarded unchanged to ``compute_mapss_measures``.

    Returns:
        A ``(zip_path, status_message)`` pair. ``zip_path`` is ``None``
        whenever validation or processing fails; failures are reported through
        the message rather than raised.
    """
    if zip_file is None:
        return None, "Please upload a ZIP file"

    # The UI declares gr.File(type="filepath"), i.e. a path string; objects
    # that expose the path as ``.name`` are still accepted.
    if isinstance(zip_file, (str, os.PathLike)):
        archive_path = zip_file
    else:
        archive_path = zip_file.name

    # Use a fixed extraction path
    extract_path = Path("/tmp/mapss_extract")
    try:
        # Validate model first so a bad selection fails before any disk work
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}"

        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract ZIP
        try:
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(extract_path)
        except zipfile.BadZipFile:
            return None, "The uploaded file is not a valid ZIP archive"

        # Find references and outputs directories
        refs_dir, outs_dir = _locate_audio_dirs(extract_path)
        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file"

        # Get audio files; pairing with outputs is positional, hence the sort
        ref_files = _list_wavs(refs_dir)
        out_files = _list_wavs(outs_dir)
        if not ref_files:
            return None, "No reference WAV files found"
        if not out_files:
            return None, "No output WAV files found"
        if len(ref_files) != len(out_files):
            return None, f"Number of reference files ({len(ref_files)}) must match number of output files ({len(out_files)}). Files must be in the same order."

        # Create manifest
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]

        # Set layer
        if model_name == "raw":
            layer_final = 0
        elif layer is not None:
            # Normalise numeric UI values (e.g. 12.0) to a plain integer index
            layer_final = int(layer)
        else:
            layer_final = _DEFAULT_LAYERS.get(model_name, 12)

        # Check GPU availability - use all available GPUs on the space
        max_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0

        # Run experiment
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,
            add_ci=False  # Disable CI for faster processing in demo
        )

        # Create output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        results_path = Path(results_dir)
        files_added = 0
        with zipfile.ZipFile(output_zip, 'w') as zipf:
            # Add all files from results, keeping the results folder name as
            # the top-level entry inside the archive
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files. Note: Output files must be in the same order as reference files."
        return None, "Processing completed but no output files were generated. Check if embeddings were computed."
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
    finally:
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()
        # Do not keep the uploaded audio on disk once the run is over
        shutil.rmtree(extract_path, ignore_errors=True)
def create_interface():
    """Build the MAPSS demo UI and return it without launching it.

    The layout is a description panel followed by two columns: inputs
    (ZIP upload, model, layer, alpha, run button) on the left and results
    (downloadable ZIP, status text) on the right.

    Returns:
        The assembled ``gr.Blocks`` application.
    """
    # Deepest selectable layer per model. Insertion order is the order in
    # which the models are listed in the dropdown.
    deepest_layer = {
        "raw": 0,
        "wavlm": 24,
        "wav2vec2": 24,
        "hubert": 24,
        "wavlm_base": 12,
        "wav2vec2_base": 12,
        "hubert_base": 12,
        "wav2vec2_xlsr": 24,
    }

    def update_layer_slider(model_name):
        """Return a layer slider whose range and default fit *model_name*."""
        if model_name == "raw":
            # Raw features have no layers: pin the slider at 0 and lock it
            top, adjustable = 0, False
        else:
            top, adjustable = deepest_layer.get(model_name, 12), True

        if adjustable:
            caption = f"Layer (max: {top}, default: {top})"
        else:
            caption = "Layer (not applicable for raw features)"

        return gr.Slider(
            minimum=0,
            maximum=top,
            value=top,
            step=1,
            label=caption,
            interactive=adjustable
        )

    overview_md = """
        # MAPSS: Manifold-based Assessment of Perceptual Source Separation
        Granular evaluation of speech and music source separation with the MAPSS measures:
        - **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
        - **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
        ## ⚠️ IMPORTANT: File Order Requirements
        **Output files MUST be in the same order as reference files!**
        - If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
        - Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
        - Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
        ## Input Format
        Upload a ZIP file containing:
        ```
        your_mixture.zip
        ├── references/       # Original clean sources
        │   ├── speaker1.wav
        │   ├── speaker2.wav
        │   └── ...
        └── outputs/         # Separated outputs (SAME ORDER as references)
            ├── separated1.wav  # Must correspond to speaker1.wav
            ├── separated2.wav  # Must correspond to speaker2.wav
            └── ...
        ```
        ### Audio Requirements
        - Format: .wav files
        - Sample rate: Any (automatically resampled to 16kHz)
        - Channels: Mono or stereo (converted to mono)
        - **Number of files: Equal number of references and outputs**
        - **Order: Output files must be in the same order as reference files**
        ## Output Format
        The tool generates a ZIP file containing:
        - `ps_scores_{model}.csv`: PS scores for each source over time
        - `pm_scores_{model}.csv`: PM scores for each source over time
        - `params.json`: Parameters used
        - `manifest_canonical.json`: File mapping and processing details
        ### Score Interpretation
        - **NaN values**: Appear in frames where fewer than 2 speakers are active
        - **Valid scores**: Only computed when at least 2 speakers are active in a frame
        - **Time resolution**: 20ms frames (configurable in code)
        ## Available Models
        | Model | Description | Default Layer | Use Case |
        |-------|-------------|---------------|----------|
        | `raw` | Raw waveform features | N/A | Baseline comparison |
        | `wavlm` | WavLM Large | 24 | Strong performance |
        | `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
        | `hubert` | HuBERT Large | 24 | Good for speech |
        | `wavlm_base` | WavLM Base | 12 | Faster processing |
        | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
        | `hubert_base` | HuBERT Base | 12 | Faster processing |
        | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
        ## Parameters
        - **Model**: Select the embedding model for feature extraction
        - **Layer**: Which transformer layer to use (auto-selected by default)
        - **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
          - 0.0 = No normalization
          - 1.0 = Full normalization (recommended)
        ## Processing Notes
        - The system automatically detects which speakers are active in each frame
        - PS/PM scores are only computed between active speakers
        - Processing time scales with number of sources and audio length
        - GPU acceleration is automatically used when available
        ## Citation
        If you use MAPSS, please cite:
        ```bibtex
        @article{Ivry2025MAPSS,
          title     = {MAPSS: Manifold-based Assessment of Perceptual Source Separation},
          author    = {Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
          journal   = {arXiv preprint arXiv:2509.09212},
          year      = {2025},
          url       = {https://arxiv.org/abs/2509.09212}
        }
        ```
        ## License
        Code: MIT License
        Paper: CC-BY-4.0
        ## Support
        For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
        """

    with gr.Blocks(title="MAPSS - Multi-source Audio Perceptual Separation Scores") as demo:
        gr.Markdown(overview_md)

        with gr.Row():
            # Left column: everything the user provides
            with gr.Column():
                zip_upload = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )
                model_picker = gr.Dropdown(
                    choices=list(deepest_layer),
                    value="wav2vec2_base",
                    label="Select embedding model"
                )
                # Initial range matches the preselected wav2vec2_base model
                layer_picker = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )
                alpha_picker = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )
                # Re-range the layer slider whenever another model is chosen
                model_picker.change(
                    fn=update_layer_slider,
                    inputs=[model_picker],
                    outputs=[layer_picker]
                )
                run_button = gr.Button("Process Audio Files", variant="primary")

            # Right column: what the run produces
            with gr.Column():
                results_download = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_box = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )

        run_button.click(
            fn=process_audio_files,
            inputs=[zip_upload, model_picker, layer_picker, alpha_picker],
            outputs=[results_download, status_box]
        )

    return demo
if __name__ == "__main__":
    # Script entry point: build the UI, then start serving it. The app is
    # kept under the module-level name ``demo``.
    demo = create_interface()
    demo.launch()