blanchon committed
Commit f875353 · 1 Parent(s): f87238b
Files changed (9)
  1. .gitignore +219 -0
  2. README.md +57 -12
  3. app.py +204 -0
  4. cli.py +182 -0
  5. notebooks/latent_analysis.ipynb +903 -0
  6. pyproject.toml +21 -0
  7. requirements.txt +280 -0
  8. uv.lock +0 -0
  9. visualize.py +281 -0
.gitignore ADDED
@@ -0,0 +1,219 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+
204
+ # Ruff stuff:
205
+ .ruff_cache/
206
+
207
+ # PyPI configuration file
208
+ .pypirc
209
+
210
+ # Marimo
211
+ marimo/_static/
212
+ marimo/_lsp/
213
+ __marimo__/
214
+
215
+ # Streamlit
216
+ .streamlit/secrets.toml
217
+
218
+ outputs/
219
+ resources/
README.md CHANGED
@@ -1,12 +1,57 @@
1
- ---
2
- title: Motion Latent Diffusion Standalone Demo
3
- emoji: 📊
4
- colorFrom: purple
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Demo
2
+
3
+ Command-line and web interfaces for motion-latent-diffusion-standalone.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ cd demo
9
+ pip install -e .
10
+ ```
11
+
12
+ ## Command Line
13
+
14
+ ```bash
15
+ # Generate motion
16
+ python cli.py --text "a person walks forward" --length 100
17
+
18
+ # Options
19
+ python cli.py --text "jumping" --length 120 --output ./outputs/ --no-video
20
+ ```
21
+
22
+ Outputs:
23
+
24
+ - `*.pt` - Motion tensor (frames, 22, 3)
25
+ - `*.latent.pt` - Latent representation
26
+ - `*.mp4` - Visualization video
27
+ - `*.txt` - Text prompt
28
+
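+ The saved files can be reloaded with `torch.load`. A minimal sketch (the file name below is a placeholder for whatever the CLI actually wrote under `./outputs/`):
+
+ ```python
+ import torch
+
+ # Motion tensor of 3D joint positions, shape (frames, 22, 3)
+ joints = torch.load("outputs/a_person_walks_forward_20250101_120000.pt")
+ # Latent representation saved alongside it
+ # (on newer PyTorch versions, pass weights_only=False if the saved object is not a plain tensor)
+ latent = torch.load("outputs/a_person_walks_forward_20250101_120000.latent.pt")
+
+ print(joints.shape, latent.shape)
+ ```
+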
29
+ ## Web Interface
30
+
31
+ ```bash
32
+ python app.py
33
+ ```
34
+
35
+ Opens at `http://localhost:7860`
36
+
37
+ ## Visualization
38
+
39
+ ```bash
40
+ # Create video from saved motion
41
+ python visualize.py motion.pt --output video.mp4 --fps 20
42
+ ```
43
+
44
+ ## Python API
45
+
46
+ ```python
47
+ from motion_latent_diffusion_standalone import MotionLatentDiffusionModel
48
+ from visualize import create_video_from_joints
49
+
50
+ model = MotionLatentDiffusionModel(
51
+ vae_repo_id="blanchon/motion-latent-diffusion-standalone-vae",
52
+ denoiser_repo_id="blanchon/motion-latent-diffusion-standalone-denoiser"
53
+ )
54
+
55
+ joints = model.generate("a person walks", length=100) # (100, 22, 3)
56
+ create_video_from_joints(joints, "output.mp4", fps=20)
57
+ ```
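+
+ The CLI and the notebook also keep the latent that produced each motion; `generate` exposes this through `return_latent=True`. A short sketch continuing from the model above (the latent shape is the one reported in `notebooks/latent_analysis.ipynb`):
+
+ ```python
+ # Returns both the decoded joints and the latent used to produce them
+ joints, latent = model.generate("a person jumps in the air", length=100, return_latent=True)
+ print(joints.shape)  # (100, 22, 3)
+ print(latent.shape)  # e.g. torch.Size([1, 1, 256])
+ ```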
app.py ADDED
@@ -0,0 +1,204 @@
1
+ from pathlib import Path
2
+ import gradio as gr
3
+ import torch
4
+ from datetime import datetime
5
+ import tempfile
6
+ from tqdm import tqdm
7
+ from textwrap import dedent
8
+ import spaces
9
+
10
+
11
+ from motion_latent_diffusion_standalone import MotionLatentDiffusionModel
12
+ from visualize import create_video_from_joints
13
+
14
+
15
+ model = MotionLatentDiffusionModel(
16
+ vae_repo_id="blanchon/motion-latent-diffusion-standalone-vae",
17
+ denoiser_repo_id="blanchon/motion-latent-diffusion-standalone-denoiser",
18
+ text_encoder_repo_id="openai/clip-vit-large-patch14",
19
+ )
20
+ model.to("cuda")
21
+ model.eval()
22
+ model.requires_grad_(False)
23
+
24
+
25
+ @spaces.GPU()
26
+ def generate_motion(
27
+ text_prompt: str, motion_length: int, progress=gr.Progress(track_tqdm=True)
28
+ ) -> tuple[Path, str, Path]:
29
+ try:
30
+ # Create temporary files
31
+ temp_dir = tempfile.gettempdir()
32
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
33
+ filename = f"motion_{timestamp}"
34
+
35
+ pt_path = Path(temp_dir) / f"{filename}.pt"
36
+ video_path = Path(temp_dir) / f"{filename}.mp4"
37
+
38
+ print("🎬 Generating motion...")
39
+ with tqdm(
40
+ total=model.num_inference_timesteps,
41
+ desc="Generating motion",
42
+ # disable=not progress.is_tracked(),
43
+ ) as pbar:
44
+
45
+ def callback_on_step_end(i: int, latents: torch.Tensor):
46
+ pbar.update(1)  # one denoising step per callback
47
+
48
+ # Generate motion (returns PyTorch tensor)
49
+ joints, latent = model.generate(
50
+ text_prompt,
51
+ motion_length,
52
+ return_latent=True,
53
+ callback_on_step_end=callback_on_step_end,
54
+ )
55
+
56
+ # Save motion data as PyTorch tensor
57
+ torch.save(joints, pt_path)
58
+
59
+ print("🎥 Creating visualization...")
60
+
61
+ # Create video visualization
62
+ video_path = create_video_from_joints(joints, video_path.as_posix(), fps=20)
63
+
64
+ print("✅ Done!")
65
+
66
+ # Generate info text
67
+ info_text = dedent(f"""
68
+ ✅ **Generation Complete!**
69
+
70
+ **Prompt:** {text_prompt}
71
+ **Motion Length:** {motion_length} frames ({motion_length / 20:.1f}s at 20fps)
72
+ **Output Shape:** {joints.shape} (frames × joints × coords)
73
+
74
+ The video shows a 3D skeleton performing the motion.
75
+ You can download both the video and the raw motion data below.
76
+ """)
77
+
78
+ return video_path, info_text, pt_path.as_posix()
79
+
80
+ except Exception as e:
81
+ error_msg = f"Error during generation: {str(e)}"
82
+ import traceback
83
+
84
+ traceback.print_exc()
85
+ return None, error_msg, None
86
+
87
+
88
+ def create_example_prompts():
89
+ """Return example prompts for the interface"""
90
+ return [
91
+ ["a person walks forward slowly", 80],
92
+ ["jumping up and down", 100],
93
+ ["a person waves hello", 60],
94
+ ["running in place", 100],
95
+ ["a person does jumping jacks", 120],
96
+ ["someone performs a cartwheel", 140],
97
+ ["walking backwards carefully", 90],
98
+ ["a person stretches their arms", 80],
99
+ ]
100
+
101
+
102
+ with gr.Blocks(title="MLD Text-to-Motion Generator", theme=gr.themes.Soft()) as demo:
103
+ # Header
104
+ gr.Markdown("""
105
+ # 🎬 MLD Text-to-Motion Generator
106
+
107
+ Generate realistic human motion animations from text descriptions!
108
+ Powered by Motion Latent Diffusion (MLD).
109
+
110
+ ### 💡 Tips for Best Results:
111
+ - Be specific: "a person walks forward slowly" works better than just "walking"
112
+ - Use present tense: "walks" or "is walking"
113
+ - Describe single continuous actions
114
+ - Recommended length: 40-60 frames for short actions, 80-120 for walking/running
115
+ """)
116
+
117
+ with gr.Row():
118
+ # Left column - Inputs
119
+ with gr.Column(scale=1):
120
+ gr.Markdown("## 📝 Input")
121
+
122
+ text_input = gr.Textbox(
123
+ label="Text Prompt",
124
+ placeholder="Enter motion description (e.g., 'a person walks forward slowly')",
125
+ lines=3,
126
+ value="a person walks forward",
127
+ )
128
+
129
+ with gr.Row():
130
+ length_slider = gr.Slider(
131
+ minimum=16,
132
+ maximum=196,
133
+ value=100,
134
+ step=1,
135
+ label="Motion Length (frames)",
136
+ info="20 frames = 1 second",
137
+ )
138
+
139
+ generate_btn = gr.Button("🎬 Generate Motion", variant="primary", size="lg")
140
+
141
+ gr.Markdown("### 📚 Example Prompts")
142
+ gr.Examples(
143
+ examples=create_example_prompts(),
144
+ inputs=[text_input, length_slider],
145
+ label=None,
146
+ )
147
+
148
+ # Right column - Outputs
149
+ with gr.Column(scale=1):
150
+ gr.Markdown("## 🎥 Output")
151
+
152
+ info_output = gr.Markdown(
153
+ "Generate a motion to see the results here.",
154
+ elem_classes=["output-info"],
155
+ )
156
+
157
+ video_output = gr.Video(
158
+ label="Generated Motion Video",
159
+ elem_classes=["output-video"],
160
+ autoplay=True,
161
+ show_share_button=True,
162
+ )
163
+
164
+ with gr.Row():
165
+ pt_download = gr.File(label="Download Motion Data (.pt)", visible=False)
166
+
167
+ # Footer
168
+ gr.Markdown(
169
+ dedent("""
170
+ ---
171
+ ### ℹ️ About
172
+
173
+ **Motion Latent Diffusion (MLD)** generates 3D human motion by:
174
+ 1. Encoding text with CLIP
175
+ 2. Generating motion in latent space via diffusion (50 steps)
176
+ 3. Decoding to 3D joint positions (22 joints)
177
+ 4. Visualizing as a 3D skeleton animation
178
+
179
+ **Citation:** Chen et al., "Executing your Commands via Motion Diffusion in Latent Space", CVPR 2023
180
+
181
+ **Repository:** [motion-latent-diffusion](https://github.com/ChenFengYe/motion-latent-diffusion)
182
+ """)
183
+ )
184
+
185
+ # Event handlers
186
+ def generate_and_update(text, length):
187
+ video, info, pt = generate_motion(text, length)
188
+ if pt:
189
+ return video, info, gr.update(value=pt, visible=True)
190
+ return video, info, gr.update(visible=False)
191
+
192
+ generate_btn.click(
193
+ fn=generate_and_update,
194
+ inputs=[text_input, length_slider],
195
+ outputs=[video_output, info_output, pt_download],
196
+ )
197
+
198
+
199
+ demo.launch(
200
+ server_name="0.0.0.0", # Allow external access
201
+ server_port=7860,
202
+ share=False,
203
+ show_error=True,
204
+ )
cli.py ADDED
@@ -0,0 +1,182 @@
1
+ """
2
+ MLD Demo CLI - Generate human motion from text using the standalone MLD package.
3
+ """
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+ from datetime import datetime
8
+ import torch
9
+ from textwrap import dedent
10
+ from tqdm import tqdm
11
+
12
+ from motion_latent_diffusion_standalone import MotionLatentDiffusionModel
13
+ from visualize import create_video_from_joints
14
+
15
+
16
+ def parse_args() -> argparse.Namespace:
17
+ """Parse command line arguments"""
18
+ parser = argparse.ArgumentParser(
19
+ description="Generate human motion from text using MLD",
20
+ formatter_class=argparse.RawDescriptionHelpFormatter,
21
+ epilog=dedent("""
22
+ Examples:
23
+ # Basic usage
24
+ python cli.py --text "a person walks forward slowly"
25
+
26
+ # Custom length
27
+ python cli.py --text "jumping jacks" --length 120
28
+
29
+ # Save to specific directory
30
+ python cli.py --text "dancing" --output ./motions/
31
+
32
+ # Skip video generation (faster)
33
+ python cli.py --text "running" --no-video
34
+ """),
35
+ )
36
+
37
+ parser.add_argument(
38
+ "--text",
39
+ type=str,
40
+ required=True,
41
+ help="Text description of the motion to generate",
42
+ )
43
+
44
+ parser.add_argument(
45
+ "--length",
46
+ type=int,
47
+ default=100,
48
+ help="Motion length in frames (default: 100, range: 16-196 for 20fps)",
49
+ )
50
+
51
+ parser.add_argument(
52
+ "--output",
53
+ type=str,
54
+ default="./outputs",
55
+ help="Output directory for generated files (default: ./outputs)",
56
+ )
57
+
58
+ parser.add_argument(
59
+ "--no-video",
60
+ action="store_true",
61
+ help="Skip video generation, only save .pt file",
62
+ )
63
+
64
+ parser.add_argument(
65
+ "--device",
66
+ type=str,
67
+ default="cuda" if torch.cuda.is_available() else "cpu",
68
+ choices=["cuda", "cpu"],
69
+ help="Device to run on (default: cuda if available, else cpu)",
70
+ )
71
+
72
+ return parser.parse_args()
73
+
74
+
75
+ def generate_filename(text: str) -> str:
76
+ """Generate a filename from text and timestamp"""
77
+ # Clean text for filename: remove special characters
78
+ text_clean = "".join(c if c.isalnum() or c.isspace() else "" for c in text)
79
+ text_clean = "_".join(text_clean.split()[:5]) # First 5 words
80
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
81
+ return f"{text_clean}_{timestamp}"
82
+
83
+
84
+ def main() -> None:
85
+ """Main entry point for CLI"""
86
+ args = parse_args()
87
+
88
+ # Validate motion length
89
+ if args.length < 16 or args.length > 196:
90
+ print(f"Warning: Length {args.length} is outside recommended range (16-196)")
91
+ print("Proceeding anyway, but results may be suboptimal.")
92
+
93
+ # Setup output paths
94
+ output_dir = Path(args.output)
95
+ output_dir.mkdir(parents=True, exist_ok=True)
96
+
97
+ # Generate output filenames
98
+ base_name = generate_filename(args.text)
99
+ pt_path = output_dir / f"{base_name}.pt"
100
+ mp4_path = output_dir / f"{base_name}.mp4"
101
+ txt_path = output_dir / f"{base_name}.txt"
102
+
103
+ print("=" * 70)
104
+ print("MLD Text-to-Motion Generator")
105
+ print("=" * 70)
106
+ print(f"Text prompt: {args.text}")
107
+ print(f"Motion length: {args.length} frames ({args.length / 20:.1f}s at 20fps)")
108
+ print(f"Output directory: {output_dir.absolute()}")
109
+ print(f"Device: {args.device}")
110
+ print("=" * 70)
111
+
112
+ # [1/4] Load model from HuggingFace Hub
113
+ print("\n[1/4] Loading model from HuggingFace Hub...")
114
+ print("This may take a minute on first run (downloads ~105MB)...")
115
+ model = MotionLatentDiffusionModel(
116
+ vae_repo_id="blanchon/motion-latent-diffusion-standalone-vae",
117
+ denoiser_repo_id="blanchon/motion-latent-diffusion-standalone-denoiser",
118
+ text_encoder_repo_id="openai/clip-vit-large-patch14",
119
+ ).to(args.device)
120
+
121
+ # [2/4] Generate motion
122
+ print("\n[2/4] Generating motion...")
123
+ print(f"Running diffusion sampling ({model.num_inference_timesteps} steps)...")
124
+
125
+ with tqdm(total=model.num_inference_timesteps, desc="Generating motion") as pbar:
126
+
127
+ def callback_on_step_end(i: int, latents: torch.Tensor):
128
+ pbar.update(1)  # one denoising step per callback
129
+
130
+ # Generate motion (returns PyTorch tensor)
131
+ joints, latent = model.generate(
132
+ args.text,
133
+ args.length,
134
+ return_latent=True,
135
+ callback_on_step_end=callback_on_step_end,
136
+ )
137
+
138
+ print(f"✓ Generated motion: {joints.shape}")
139
+ print(
140
+ f" Shape: ({joints.shape[0]} frames, {joints.shape[1]} joints, {joints.shape[2]} coords)"
141
+ )
142
+
143
+ # [3/4] Save motion file as PyTorch tensor
144
+ print("\n[3/4] Saving files...")
145
+ torch.save(joints, pt_path)
146
+ print(f"✓ Saved motion: {pt_path}")
147
+
148
+ # Save latent representation
149
+ latent_path = output_dir / f"{base_name}.latent.pt"
150
+ torch.save(latent, latent_path)
151
+ print(f"✓ Saved latent: {latent_path}")
152
+
153
+ # Save text prompt for reference
154
+ with open(txt_path, "w") as f:
155
+ f.write(args.text)
156
+ print(f"✓ Saved prompt: {txt_path}")
157
+
158
+ # [4/4] Generate video if requested
159
+ if not args.no_video:
160
+ print("\n[4/4] Generating video visualization...")
161
+ video_path = create_video_from_joints(joints, str(mp4_path), fps=20)
162
+ print(f"✓ Generated video: {video_path}")
163
+ else:
164
+ print("\n[4/4] Skipping video generation (--no-video flag)")
165
+
166
+ # Print summary
167
+ print("\n" + "=" * 70)
168
+ print("✓ Generation complete!")
169
+ print("=" * 70)
170
+ print("Output files:")
171
+ print(f" Motion data: {pt_path}")
172
+ print(f" Latent repr: {latent_path}")
173
+ print(f" Text prompt: {txt_path}")
174
+ if not args.no_video:
175
+ print(f" Video: {mp4_path}")
176
+ print("\nTo visualize the motion later:")
177
+ print(f" python visualize.py {pt_path}")
178
+ print("=" * 70)
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
notebooks/latent_analysis.ipynb ADDED
@@ -0,0 +1,903 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Motion Latent Analysis\n",
8
+ "\n",
9
+ "This notebook demonstrates how to work with motion latent representations from the MLD model:\n",
10
+ "\n",
11
+ "1. **Generate variations** - Create 10 similar \"jump\" motions\n",
12
+ "2. **Compute mean latent** - Average the latent representations\n",
13
+ "3. **Distance computation** - Compare motions using L2 distance\n",
14
+ "4. **Classification** - Distinguish jump from non-jump motions\n"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "markdown",
19
+ "metadata": {},
20
+ "source": [
21
+ "## Setup and Imports\n"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "name": "stderr",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/.venv/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
34
+ " from .autonotebook import tqdm as notebook_tqdm\n"
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "import numpy as np\n",
40
+ "import torch\n",
41
+ "from pathlib import Path\n",
42
+ "from standalone_demo import StandaloneConfig, load_model\n",
43
+ "\n",
44
+ "# Configuration\n",
45
+ "OUTPUT_DIR = Path(\"outputs/jump\")\n",
46
+ "NUM_VARIATIONS = 20\n",
47
+ "MOTION_LENGTH = 120 # frames (6 seconds at 20fps)"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "## Load Model\n",
55
+ "\n",
56
+ "Load the MLD model for motion generation. This will auto-download models if needed.\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 2,
62
+ "metadata": {},
63
+ "outputs": [
64
+ {
65
+ "name": "stdout",
66
+ "output_type": "stream",
67
+ "text": [
68
+ "Loading MLD model...\n",
69
+ "Model initialized on cuda\n",
70
+ "Loading checkpoint from resources/checkpoints/model.ckpt\n",
71
+ "Checkpoint loaded successfully\n",
72
+ "✓ Model loaded successfully\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "print(\"Loading MLD model...\")\n",
78
+ "config = StandaloneConfig()\n",
79
+ "config.resolve_paths(Path(\".\"))\n",
80
+ "model = load_model(config)\n",
81
+ "print(\"✓ Model loaded successfully\")"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {},
87
+ "source": [
88
+ "## Step 1: Generate jump Variations\n",
89
+ "\n",
90
+ "Generate 10 variations of \"jump\" motions using slightly different prompts.\n",
91
+ "Each generation saves:\n",
92
+ "- `.npy` - 3D joint positions\n",
93
+ "- `.latent.pt` - Latent representation\n"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "metadata": {},
100
+ "outputs": [
101
+ {
102
+ "name": "stdout",
103
+ "output_type": "stream",
104
+ "text": [
105
+ "Generating 20 jump variations...\n",
106
+ "\n",
107
+ "[1/20] a person does a jump\n"
108
+ ]
109
+ },
110
+ {
111
+ "name": "stderr",
112
+ "output_type": "stream",
113
+ "text": [
114
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
115
+ " lengths = torch.tensor(lengths, device=device)\n"
116
+ ]
117
+ },
118
+ {
119
+ "name": "stdout",
120
+ "output_type": "stream",
121
+ "text": [
122
+ " ✓ Saved jump_var_00\n",
123
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
124
+ "[2/20] someone performs a jump\n"
125
+ ]
126
+ },
127
+ {
128
+ "name": "stderr",
129
+ "output_type": "stream",
130
+ "text": [
131
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
132
+ " lengths = torch.tensor(lengths, device=device)\n"
133
+ ]
134
+ },
135
+ {
136
+ "name": "stdout",
137
+ "output_type": "stream",
138
+ "text": [
139
+ " ✓ Saved jump_var_01\n",
140
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
141
+ "[3/20] a person jumps in the air\n"
142
+ ]
143
+ },
144
+ {
145
+ "name": "stderr",
146
+ "output_type": "stream",
147
+ "text": [
148
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
149
+ " lengths = torch.tensor(lengths, device=device)\n"
150
+ ]
151
+ },
152
+ {
153
+ "name": "stdout",
154
+ "output_type": "stream",
155
+ "text": [
156
+ " ✓ Saved jump_var_02\n",
157
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
158
+ "[4/20] doing a jump\n"
159
+ ]
160
+ },
161
+ {
162
+ "name": "stderr",
163
+ "output_type": "stream",
164
+ "text": [
165
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
166
+ " lengths = torch.tensor(lengths, device=device)\n"
167
+ ]
168
+ },
169
+ {
170
+ "name": "stdout",
171
+ "output_type": "stream",
172
+ "text": [
173
+ " ✓ Saved jump_var_03\n",
174
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
175
+ "[5/20] performing a jump\n"
176
+ ]
177
+ },
178
+ {
179
+ "name": "stderr",
180
+ "output_type": "stream",
181
+ "text": [
182
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
183
+ " lengths = torch.tensor(lengths, device=device)\n"
184
+ ]
185
+ },
186
+ {
187
+ "name": "stdout",
188
+ "output_type": "stream",
189
+ "text": [
190
+ " ✓ Saved jump_var_04\n",
191
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
192
+ "[6/20] a person does a jump\n"
193
+ ]
194
+ },
195
+ {
196
+ "name": "stderr",
197
+ "output_type": "stream",
198
+ "text": [
199
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
200
+ " lengths = torch.tensor(lengths, device=device)\n"
201
+ ]
202
+ },
203
+ {
204
+ "name": "stdout",
205
+ "output_type": "stream",
206
+ "text": [
207
+ " ✓ Saved jump_var_05\n",
208
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
209
+ "[7/20] someone jumps backward\n"
210
+ ]
211
+ },
212
+ {
213
+ "name": "stderr",
214
+ "output_type": "stream",
215
+ "text": [
216
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
217
+ " lengths = torch.tensor(lengths, device=device)\n"
218
+ ]
219
+ },
220
+ {
221
+ "name": "stdout",
222
+ "output_type": "stream",
223
+ "text": [
224
+ " ✓ Saved jump_var_06\n",
225
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
226
+ "[8/20] a person executes a jump\n"
227
+ ]
228
+ },
229
+ {
230
+ "name": "stderr",
231
+ "output_type": "stream",
232
+ "text": [
233
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
234
+ " lengths = torch.tensor(lengths, device=device)\n"
235
+ ]
236
+ },
237
+ {
238
+ "name": "stdout",
239
+ "output_type": "stream",
240
+ "text": [
241
+ " ✓ Saved jump_var_07\n",
242
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
243
+ "[9/20] doing an acrobatic jump\n"
244
+ ]
245
+ },
246
+ {
247
+ "name": "stderr",
248
+ "output_type": "stream",
249
+ "text": [
250
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
251
+ " lengths = torch.tensor(lengths, device=device)\n"
252
+ ]
253
+ },
254
+ {
255
+ "name": "stdout",
256
+ "output_type": "stream",
257
+ "text": [
258
+ " ✓ Saved jump_var_08\n",
259
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
260
+ "[10/20] a person jumps forward\n"
261
+ ]
262
+ },
263
+ {
264
+ "name": "stderr",
265
+ "output_type": "stream",
266
+ "text": [
267
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
268
+ " lengths = torch.tensor(lengths, device=device)\n"
269
+ ]
270
+ },
271
+ {
272
+ "name": "stdout",
273
+ "output_type": "stream",
274
+ "text": [
275
+ " ✓ Saved jump_var_09\n",
276
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
277
+ "[11/20] a person does a jump\n"
278
+ ]
279
+ },
280
+ {
281
+ "name": "stderr",
282
+ "output_type": "stream",
283
+ "text": [
284
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
285
+ " lengths = torch.tensor(lengths, device=device)\n"
286
+ ]
287
+ },
288
+ {
289
+ "name": "stdout",
290
+ "output_type": "stream",
291
+ "text": [
292
+ " ✓ Saved jump_var_10\n",
293
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
294
+ "[12/20] someone performs a jump\n"
295
+ ]
296
+ },
297
+ {
298
+ "name": "stderr",
299
+ "output_type": "stream",
300
+ "text": [
301
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
302
+ " lengths = torch.tensor(lengths, device=device)\n"
303
+ ]
304
+ },
305
+ {
306
+ "name": "stdout",
307
+ "output_type": "stream",
308
+ "text": [
309
+ " ✓ Saved jump_var_11\n",
310
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
311
+ "[13/20] a person jumps in the air\n"
312
+ ]
313
+ },
314
+ {
315
+ "name": "stderr",
316
+ "output_type": "stream",
317
+ "text": [
318
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
319
+ " lengths = torch.tensor(lengths, device=device)\n"
320
+ ]
321
+ },
322
+ {
323
+ "name": "stdout",
324
+ "output_type": "stream",
325
+ "text": [
326
+ " ✓ Saved jump_var_12\n",
327
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
328
+ "[14/20] doing a jump\n"
329
+ ]
330
+ },
331
+ {
332
+ "name": "stderr",
333
+ "output_type": "stream",
334
+ "text": [
335
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
336
+ " lengths = torch.tensor(lengths, device=device)\n"
337
+ ]
338
+ },
339
+ {
340
+ "name": "stdout",
341
+ "output_type": "stream",
342
+ "text": [
343
+ " ✓ Saved jump_var_13\n",
344
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
345
+ "[15/20] performing a jump\n"
346
+ ]
347
+ },
348
+ {
349
+ "name": "stderr",
350
+ "output_type": "stream",
351
+ "text": [
352
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
353
+ " lengths = torch.tensor(lengths, device=device)\n"
354
+ ]
355
+ },
356
+ {
357
+ "name": "stdout",
358
+ "output_type": "stream",
359
+ "text": [
360
+ " ✓ Saved jump_var_14\n",
361
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
362
+ "[16/20] a person does a jump\n"
363
+ ]
364
+ },
365
+ {
366
+ "name": "stderr",
367
+ "output_type": "stream",
368
+ "text": [
369
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
370
+ " lengths = torch.tensor(lengths, device=device)\n"
371
+ ]
372
+ },
373
+ {
374
+ "name": "stdout",
375
+ "output_type": "stream",
376
+ "text": [
377
+ " ✓ Saved jump_var_15\n",
378
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
379
+ "[17/20] someone jumps backward\n"
380
+ ]
381
+ },
382
+ {
383
+ "name": "stderr",
384
+ "output_type": "stream",
385
+ "text": [
386
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
387
+ " lengths = torch.tensor(lengths, device=device)\n"
388
+ ]
389
+ },
390
+ {
391
+ "name": "stdout",
392
+ "output_type": "stream",
393
+ "text": [
394
+ " ✓ Saved jump_var_16\n",
395
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
396
+ "[18/20] a person executes a jump\n"
397
+ ]
398
+ },
399
+ {
400
+ "name": "stderr",
401
+ "output_type": "stream",
402
+ "text": [
403
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
404
+ " lengths = torch.tensor(lengths, device=device)\n"
405
+ ]
406
+ },
407
+ {
408
+ "name": "stdout",
409
+ "output_type": "stream",
410
+ "text": [
411
+ " ✓ Saved jump_var_17\n",
412
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
413
+ "[19/20] doing an acrobatic jump\n"
414
+ ]
415
+ },
416
+ {
417
+ "name": "stderr",
418
+ "output_type": "stream",
419
+ "text": [
420
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
421
+ " lengths = torch.tensor(lengths, device=device)\n"
422
+ ]
423
+ },
424
+ {
425
+ "name": "stdout",
426
+ "output_type": "stream",
427
+ "text": [
428
+ " ✓ Saved jump_var_18\n",
429
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
430
+ "[20/20] a person jumps forward\n"
431
+ ]
432
+ },
433
+ {
434
+ "name": "stderr",
435
+ "output_type": "stream",
436
+ "text": [
437
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
438
+ " lengths = torch.tensor(lengths, device=device)\n"
439
+ ]
440
+ },
441
+ {
442
+ "name": "stdout",
443
+ "output_type": "stream",
444
+ "text": [
445
+ " ✓ Saved jump_var_19\n",
446
+ " Joints: (120, 22, 3), Latent: torch.Size([1, 1, 256])\n",
447
+ "\n",
448
+ "✓ Generated 20 jump variations\n"
449
+ ]
450
+ }
451
+ ],
452
+ "source": [
453
+ "import shutil\n",
454
+ "\n",
455
+ "# Create output directory\n",
456
+ "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
457
+ "\n",
458
+ "# Define prompt variations\n",
459
+ "jump_prompts = [\n",
460
+ " \"a person does a jump\",\n",
461
+ " \"someone performs a jump\",\n",
462
+ " \"a person jumps in the air\",\n",
463
+ " \"doing a jump\",\n",
464
+ " \"performing a jump\",\n",
465
+ " \"a person does a jump\",\n",
466
+ " \"someone jumps backward\",\n",
467
+ " \"a person executes a jump\",\n",
468
+ " \"doing an acrobatic jump\",\n",
469
+ " \"a person jumps forward\",\n",
470
+ " \"a person does a jump\",\n",
471
+ " \"someone performs a jump\",\n",
472
+ " \"a person jumps in the air\",\n",
473
+ " \"doing a jump\",\n",
474
+ " \"performing a jump\",\n",
475
+ " \"a person does a jump\",\n",
476
+ " \"someone jumps backward\",\n",
477
+ " \"a person executes a jump\",\n",
478
+ " \"doing an acrobatic jump\",\n",
479
+ " \"a person jumps forward\",\n",
480
+ " \"a person does a jump\",\n",
481
+ " \"someone performs a jump\",\n",
482
+ " \"a person jumps in the air\",\n",
483
+ " \"doing a jump\",\n",
484
+ " \"performing a jump\",\n",
485
+ " \"a person does a jump\",\n",
486
+ " \"someone jumps backward\",\n",
487
+ " \"a person executes a jump\",\n",
488
+ " \"doing an acrobatic jump\",\n",
489
+ " \"a person jumps forward\",\n",
490
+ "]\n",
491
+ "\n",
492
+ "print(f\"Generating {NUM_VARIATIONS} jump variations...\\n\")\n",
493
+ "\n",
494
+ "latent_paths = []\n",
495
+ "\n",
496
+ "for i, prompt in enumerate(jump_prompts[:NUM_VARIATIONS]):\n",
497
+ " print(f\"[{i + 1}/{NUM_VARIATIONS}] {prompt}\")\n",
498
+ "\n",
499
+ " # Generate motion with latent\n",
500
+ " (joints, latent, video_path) = model.generate(\n",
501
+ " prompt, MOTION_LENGTH, return_latent=True, create_video=True\n",
502
+ " )\n",
503
+ "\n",
504
+ " # Save files\n",
505
+ " base_name = f\"jump_var_{i:02d}\"\n",
506
+ " npy_path = OUTPUT_DIR / f\"{base_name}.npy\"\n",
507
+ " latent_path = OUTPUT_DIR / f\"{base_name}.latent.pt\"\n",
508
+ "\n",
509
+ " np.save(npy_path, joints)\n",
510
+ " torch.save(latent, latent_path)\n",
511
+ " latent_paths.append(latent_path)\n",
512
+ "\n",
513
+ " # Save video\n",
514
+ " video_path_target = OUTPUT_DIR / f\"{base_name}.mp4\"\n",
515
+ " shutil.copy(video_path, video_path_target)\n",
516
+ "\n",
517
+ " print(f\" ✓ Saved {base_name}\")\n",
518
+ " print(f\" Joints: {joints.shape}, Latent: {latent.shape}\")\n",
519
+ "\n",
520
+ "print(f\"\\n✓ Generated {len(latent_paths)} jump variations\")"
521
+ ]
522
+ },
523
+ {
524
+ "cell_type": "markdown",
525
+ "metadata": {},
526
+ "source": [
527
+ "## Step 2: Compute Mean Latent\n",
528
+ "\n",
529
+ "Average all flip latents to create a \"prototype\" flip representation.\n"
530
+ ]
531
+ },
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": 4,
535
+ "metadata": {},
536
+ "outputs": [
537
+ {
538
+ "name": "stdout",
539
+ "output_type": "stream",
540
+ "text": [
541
+ "Computing mean latent from 20 samples...\n",
542
+ "✓ Mean latent shape: torch.Size([1, 1, 256])\n",
543
+ "✓ Saved to: outputs/jump/jump_mean.latent.pt\n"
544
+ ]
545
+ }
546
+ ],
547
+ "source": [
548
+ "print(f\"Computing mean latent from {len(latent_paths)} samples...\")\n",
549
+ "\n",
550
+ "# Load all latents\n",
551
+ "latents = [torch.load(path) for path in latent_paths]\n",
552
+ "\n",
553
+ "# Stack and compute mean\n",
554
+ "latents_stacked = torch.stack(latents)\n",
555
+ "mean_latent = latents_stacked.mean(dim=0)\n",
556
+ "\n",
557
+ "# Save mean latent\n",
558
+ "mean_latent_path = OUTPUT_DIR / \"jump_mean.latent.pt\"\n",
559
+ "torch.save(mean_latent, mean_latent_path)\n",
560
+ "\n",
561
+ "print(f\"✓ Mean latent shape: {mean_latent.shape}\")\n",
562
+ "print(f\"✓ Saved to: {mean_latent_path}\")"
563
+ ]
564
+ },
565
+ {
566
+ "cell_type": "markdown",
567
+ "metadata": {},
568
+ "source": [
569
+ "## Step 3: Define Distance Function\n",
570
+ "\n",
571
+ "L2 distance measures similarity between latent representations.\n"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": 5,
577
+ "metadata": {},
578
+ "outputs": [
579
+ {
580
+ "name": "stdout",
581
+ "output_type": "stream",
582
+ "text": [
583
+ "✓ Distance function defined\n"
584
+ ]
585
+ }
586
+ ],
587
+ "source": [
588
+ "def compute_latent_distance(latent1, latent2):\n",
589
+ " \"\"\"\n",
590
+ " Compute L2 (Euclidean) distance between two latent representations.\n",
591
+ "\n",
592
+ " Args:\n",
593
+ " latent1: First latent tensor or path\n",
594
+ " latent2: Second latent tensor or path\n",
595
+ "\n",
596
+ " Returns:\n",
597
+ " L2 distance (float)\n",
598
+ " \"\"\"\n",
599
+ " # Load if paths provided\n",
600
+ " if isinstance(latent1, (str, Path)):\n",
601
+ " latent1 = torch.load(latent1)\n",
602
+ " if isinstance(latent2, (str, Path)):\n",
603
+ " latent2 = torch.load(latent2)\n",
604
+ "\n",
605
+ " # Compute L2 norm of difference\n",
606
+ " distance = torch.norm(latent1 - latent2, p=2).item()\n",
607
+ "\n",
608
+ " return distance\n",
609
+ "\n",
610
+ "\n",
611
+ "print(\"✓ Distance function defined\")"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "markdown",
616
+ "metadata": {},
617
+ "source": [
618
+ "## Step 4: Generate Test Motions\n",
619
+ "\n",
620
+ "Generate:\n",
621
+ "- A flip motion (should be close to mean)\n",
622
+ "- A walk motion (should be far from mean)\n"
623
+ ]
624
+ },
625
+ {
626
+ "cell_type": "code",
627
+ "execution_count": 6,
628
+ "metadata": {},
629
+ "outputs": [
630
+ {
631
+ "name": "stdout",
632
+ "output_type": "stream",
633
+ "text": [
634
+ "Generating test motions...\n",
635
+ "\n",
636
+ "1. Generating jump-like motion...\n"
637
+ ]
638
+ },
639
+ {
640
+ "name": "stderr",
641
+ "output_type": "stream",
642
+ "text": [
643
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
644
+ " lengths = torch.tensor(lengths, device=device)\n"
645
+ ]
646
+ },
647
+ {
648
+ "name": "stdout",
649
+ "output_type": "stream",
650
+ "text": [
651
+ " ✓ Saved test jump motion\n",
652
+ "\n",
653
+ "2. Generating non-jump motion (walking)...\n"
654
+ ]
655
+ },
656
+ {
657
+ "name": "stderr",
658
+ "output_type": "stream",
659
+ "text": [
660
+ "/workspace/ai-toolkit/motion-latent-diffusion/standalone_demo/src/standalone_demo/models/utils.py:23: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
661
+ " lengths = torch.tensor(lengths, device=device)\n"
662
+ ]
663
+ },
664
+ {
665
+ "name": "stdout",
666
+ "output_type": "stream",
667
+ "text": [
668
+ " ✓ Saved test walk motion\n"
669
+ ]
670
+ }
671
+ ],
672
+ "source": [
673
+ "print(\"Generating test motions...\\n\")\n",
674
+ "\n",
675
+ "# Test 1: jump-like motion\n",
676
+ "print(\"1. Generating jump-like motion...\")\n",
677
+ "joints_jump, latent_jump, video_path_jump = model.generate(\n",
678
+ " \"a person does a jump\", MOTION_LENGTH, return_latent=True, create_video=True\n",
679
+ ")\n",
680
+ "jump_latent_path = OUTPUT_DIR / \"test_jump.latent.pt\"\n",
681
+ "torch.save(latent_jump, jump_latent_path)\n",
682
+ "np.save(OUTPUT_DIR / \"test_jump.npy\", joints_jump)\n",
683
+ "\n",
684
+ "video_path_target = OUTPUT_DIR / \"test_jump.mp4\"\n",
685
+ "shutil.copy(video_path_jump, video_path_target)\n",
686
+ "\n",
687
+ "print(f\" ✓ Saved test jump motion\")\n",
688
+ "\n",
689
+ "# Test 2: Non-jump motion (walking)\n",
690
+ "print(\"\\n2. Generating non-jump motion (walking)...\")\n",
691
+ "joints_walk, latent_walk, video_path_walk = model.generate(\n",
692
+ " \"a person walks forward\", MOTION_LENGTH, return_latent=True, create_video=True\n",
693
+ ")\n",
694
+ "walk_latent_path = OUTPUT_DIR / \"test_walk.latent.pt\"\n",
695
+ "torch.save(latent_walk, walk_latent_path)\n",
696
+ "np.save(OUTPUT_DIR / \"test_walk.npy\", joints_walk)\n",
697
+ "\n",
698
+ "video_path_target = OUTPUT_DIR / \"test_walk.mp4\"\n",
699
+ "shutil.copy(video_path_walk, video_path_target)\n",
700
+ "\n",
701
+ "print(f\" ✓ Saved test walk motion\")"
702
+ ]
703
+ },
704
+ {
705
+ "cell_type": "markdown",
706
+ "metadata": {},
707
+ "source": [
708
+ "## Step 5: Compare Distances\n",
709
+ "\n",
710
+ "Measure how close each test motion is to the mean jump latent.\n",
711
+ "\n",
712
+ "**Hypothesis**: jump motion should have smaller distance than walk motion.\n"
713
+ ]
714
+ },
715
+ {
716
+ "cell_type": "code",
717
+ "execution_count": 7,
718
+ "metadata": {},
719
+ "outputs": [
720
+ {
721
+ "name": "stdout",
722
+ "output_type": "stream",
723
+ "text": [
724
+ "Computing distances to mean jump latent...\n",
725
+ "\n",
726
+ "============================================================\n",
727
+ "📊 RESULTS\n",
728
+ "============================================================\n",
729
+ "Distance (jump → mean jump): 12.6496\n",
730
+ "Distance (walk → mean jump): 42.3448\n",
731
+ "\n",
732
+ "Ratio (walk/jump): 3.35x\n",
733
+ "============================================================\n",
734
+ "\n",
735
+ "✅ SUCCESS: jump is closer to mean jump latent!\n",
736
+ " The model can distinguish jump from non-jump motions.\n"
737
+ ]
738
+ }
739
+ ],
740
+ "source": [
741
+ "print(\"Computing distances to mean jump latent...\\n\")\n",
742
+ "\n",
743
+ "# Distance: Test jump → Mean jump\n",
744
+ "dist_jump_to_mean = compute_latent_distance(latent_jump, mean_latent)\n",
745
+ "\n",
746
+ "# Distance: Test walk → Mean jump\n",
747
+ "dist_walk_to_mean = compute_latent_distance(latent_walk, mean_latent)\n",
748
+ "\n",
749
+ "# Display results\n",
750
+ "print(\"=\" * 60)\n",
751
+ "print(\"📊 RESULTS\")\n",
752
+ "print(\"=\" * 60)\n",
753
+ "print(f\"Distance (jump → mean jump): {dist_jump_to_mean:.4f}\")\n",
754
+ "print(f\"Distance (walk → mean jump): {dist_walk_to_mean:.4f}\")\n",
755
+ "print(f\"\\nRatio (walk/jump): {dist_walk_to_mean / dist_jump_to_mean:.2f}x\")\n",
756
+ "print(\"=\" * 60)\n",
757
+ "\n",
758
+ "if dist_jump_to_mean < dist_walk_to_mean:\n",
759
+ " print(\"\\n✅ SUCCESS: jump is closer to mean jump latent!\")\n",
760
+ " print(f\" The model can distinguish jump from non-jump motions.\")\n",
761
+ "else:\n",
762
+ " print(\"\\n⚠️ UNEXPECTED: Walk is closer to mean jump latent.\")\n",
763
+ " print(f\" This suggests the latent space may not capture this distinction.\")"
764
+ ]
765
+ },
766
+ {
767
+ "cell_type": "markdown",
768
+ "metadata": {},
769
+ "source": [
770
+ "## Bonus: Analyze Individual Variation Distances\n",
771
+ "\n",
772
+ "See how much each jump variation differs from the mean.\n"
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "code",
777
+ "execution_count": 8,
778
+ "metadata": {},
779
+ "outputs": [
780
+ {
781
+ "name": "stdout",
782
+ "output_type": "stream",
783
+ "text": [
784
+ "Analyzing variation distances...\n",
785
+ "\n",
786
+ " Variation 00: 17.7083\n",
787
+ " Variation 01: 23.6372\n",
788
+ " Variation 02: 23.7708\n",
789
+ " Variation 03: 27.0579\n",
790
+ " Variation 04: 17.2911\n",
791
+ " Variation 05: 18.6115\n",
792
+ " Variation 06: 43.8279\n",
793
+ " Variation 07: 29.0473\n",
794
+ " Variation 08: 23.5446\n",
795
+ " Variation 09: 20.4132\n",
796
+ " Variation 10: 14.3313\n",
797
+ " Variation 11: 19.8556\n",
798
+ " Variation 12: 31.8104\n",
799
+ " Variation 13: 20.7619\n",
800
+ " Variation 14: 22.4498\n",
801
+ " Variation 15: 34.5026\n",
802
+ " Variation 16: 26.5776\n",
803
+ " Variation 17: 38.9580\n",
804
+ " Variation 18: 28.6006\n",
805
+ " Variation 19: 24.1094\n",
806
+ "\n",
807
+ "Variation statistics:\n",
808
+ " Mean distance: 25.3433\n",
809
+ " Std deviation: 7.2979\n",
810
+ "\n",
811
+ "Comparison:\n",
812
+ " Test jump: 12.6496 (0.50x mean variation)\n",
813
+ " Test walk: 42.3448 (1.67x mean variation)\n"
814
+ ]
815
+ }
816
+ ],
817
+ "source": [
818
+ "print(\"Analyzing variation distances...\\n\")\n",
819
+ "\n",
820
+ "variation_distances = []\n",
821
+ "for i, latent_path in enumerate(latent_paths):\n",
822
+ " dist = compute_latent_distance(latent_path, mean_latent)\n",
823
+ " variation_distances.append(dist)\n",
824
+ " print(f\" Variation {i:02d}: {dist:.4f}\")\n",
825
+ "\n",
826
+ "avg_variation = np.mean(variation_distances)\n",
827
+ "std_variation = np.std(variation_distances)\n",
828
+ "\n",
829
+ "print(f\"\\nVariation statistics:\")\n",
830
+ "print(f\" Mean distance: {avg_variation:.4f}\")\n",
831
+ "print(f\" Std deviation: {std_variation:.4f}\")\n",
832
+ "print(f\"\\nComparison:\")\n",
833
+ "print(\n",
834
+ " f\" Test jump: {dist_jump_to_mean:.4f} ({dist_jump_to_mean / avg_variation:.2f}x mean variation)\"\n",
835
+ ")\n",
836
+ "print(\n",
837
+ " f\" Test walk: {dist_walk_to_mean:.4f} ({dist_walk_to_mean / avg_variation:.2f}x mean variation)\"\n",
838
+ ")"
839
+ ]
840
+ },
841
+ {
842
+ "cell_type": "markdown",
843
+ "metadata": {},
844
+ "source": [
845
+ "## Summary\n",
846
+ "\n",
847
+ "### 📁 Files Created\n",
848
+ "\n",
849
+ "In `outputs/jump/`:\n",
850
+ "- `jump_var_00` to `jump_var_09` (.npy + .latent.pt) - 10 jump variations\n",
851
+ "- `jump_mean.latent.pt` - Mean latent of all variations ⭐\n",
852
+ "- `test_jump` (.npy + .latent.pt) - Test jump motion\n",
853
+ "- `test_walk` (.npy + .latent.pt) - Test walk motion\n",
854
+ "\n",
855
+ "**Total**: 24 files (10 variations + 2 tests + 1 mean + videos)\n",
856
+ "\n",
857
+ "### 🔬 Key Findings\n",
858
+ "\n",
859
+ "1. **Latent space clustering**: Similar motions (jumps) have similar latent representations\n",
860
+ "2. **Distance metric**: L2 distance effectively distinguishes motion types\n",
861
+ "3. **Mean latent**: Averaging latents creates a useful prototype representation\n",
862
+ "\n",
863
+ "### 🎯 Applications\n",
864
+ "\n",
865
+ "- **Motion classification**: Identify motion types (jump, walk, jump, etc.)\n",
866
+ "- **Motion retrieval**: Find similar motions in a database\n",
867
+ "- **Quality control**: Detect outlier/corrupted generations\n",
868
+ "- **Interpolation**: Blend between different motions\n",
869
+ "- **Style transfer**: Map motions to similar but different styles\n",
870
+ "- **Few-shot learning**: Create classifiers from few examples\n",
871
+ "\n",
872
+ "### 💡 Next Steps\n",
873
+ "\n",
874
+ "Try this analysis with other motion types:\n",
875
+ "- Jumps, spins, kicks, dances\n",
876
+ "- Compare multiple motion classes\n",
877
+ "- Build a motion classifier\n",
878
+ "- Create a motion search engine\n"
879
+ ]
880
+ }
881
+ ],
882
+ "metadata": {
883
+ "kernelspec": {
884
+ "display_name": ".venv",
885
+ "language": "python",
886
+ "name": "python3"
887
+ },
888
+ "language_info": {
889
+ "codemirror_mode": {
890
+ "name": "ipython",
891
+ "version": 3
892
+ },
893
+ "file_extension": ".py",
894
+ "mimetype": "text/x-python",
895
+ "name": "python",
896
+ "nbconvert_exporter": "python",
897
+ "pygments_lexer": "ipython3",
898
+ "version": "3.13.7"
899
+ }
900
+ },
901
+ "nbformat": 4,
902
+ "nbformat_minor": 2
903
+ }
pyproject.toml ADDED
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "motion-latent-diffusion-standalone-demo"
3
+ version = "0.1.0"
4
+ description = "Demo applications for MLD Text-to-Motion Generator - CLI and Gradio interface"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "gradio==5.49.1",
9
+ "matplotlib>=3.10.7",
10
+ "numpy>=2.3.4",
11
+ "torch>=2.9.0",
12
+ "tqdm>=4.67.1",
13
+ "spaces>=0.42.1",
14
+ "motion-latent-diffusion-standalone",
15
+ ]
16
+
17
+ [dependency-groups]
18
+ dev = ["ipykernel>=7.0.1"]
19
+
20
+ [tool.uv.sources]
21
+ motion-latent-diffusion-standalone = { git = "https://github.com/julien-blanchon/minimal-motion-generation" }
requirements.txt ADDED
@@ -0,0 +1,280 @@
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile pyproject.toml -o requirements.txt
3
+ aiofiles==24.1.0
4
+ # via gradio
5
+ annotated-types==0.7.0
6
+ # via pydantic
7
+ anyio==4.11.0
8
+ # via
9
+ # gradio
10
+ # httpx
11
+ # starlette
12
+ brotli==1.1.0
13
+ # via gradio
14
+ certifi==2025.10.5
15
+ # via
16
+ # httpcore
17
+ # httpx
18
+ # requests
19
+ charset-normalizer==3.4.4
20
+ # via requests
21
+ click==8.3.0
22
+ # via
23
+ # typer
24
+ # uvicorn
25
+ contourpy==1.3.3
26
+ # via matplotlib
27
+ cycler==0.12.1
28
+ # via matplotlib
29
+ diffusers==0.35.2
30
+ # via motion-latent-diffusion-standalone
31
+ fastapi==0.119.0
32
+ # via gradio
33
+ ffmpy==0.6.3
34
+ # via gradio
35
+ filelock==3.20.0
36
+ # via
37
+ # diffusers
38
+ # huggingface-hub
39
+ # torch
40
+ # transformers
41
+ fonttools==4.60.1
42
+ # via matplotlib
43
+ fsspec==2025.9.0
44
+ # via
45
+ # gradio-client
46
+ # huggingface-hub
47
+ # torch
48
+ gradio==5.49.1
49
+ # via
50
+ # motion-latent-diffusion-standalone-demo (pyproject.toml)
51
+ # spaces
52
+ gradio-client==1.13.3
53
+ # via gradio
54
+ groovy==0.1.2
55
+ # via gradio
56
+ h11==0.16.0
57
+ # via
58
+ # httpcore
59
+ # uvicorn
60
+ hf-xet==1.1.10
61
+ # via huggingface-hub
62
+ httpcore==1.0.9
63
+ # via httpx
64
+ httpx==0.28.1
65
+ # via
66
+ # gradio
67
+ # gradio-client
68
+ # safehttpx
69
+ # spaces
70
+ huggingface-hub==0.35.3
71
+ # via
72
+ # diffusers
73
+ # gradio
74
+ # gradio-client
75
+ # motion-latent-diffusion-standalone
76
+ # tokenizers
77
+ # transformers
78
+ idna==3.11
79
+ # via
80
+ # anyio
81
+ # httpx
82
+ # requests
83
+ importlib-metadata==8.7.0
84
+ # via diffusers
85
+ jinja2==3.1.6
86
+ # via
87
+ # gradio
88
+ # torch
89
+ kiwisolver==1.4.9
90
+ # via matplotlib
91
+ markdown-it-py==4.0.0
92
+ # via rich
93
+ markupsafe==3.0.3
94
+ # via
95
+ # gradio
96
+ # jinja2
97
+ matplotlib==3.10.7
98
+ # via motion-latent-diffusion-standalone-demo (pyproject.toml)
99
+ mdurl==0.1.2
100
+ # via markdown-it-py
101
+ motion-latent-diffusion-standalone @ git+https://github.com/julien-blanchon/minimal-motion-generation@33192ac1b86149a521b636ab8e5e26e3851f079e
102
+ # via motion-latent-diffusion-standalone-demo (pyproject.toml)
103
+ mpmath==1.3.0
104
+ # via sympy
105
+ networkx==3.5
106
+ # via torch
107
+ numpy==2.3.4
108
+ # via
109
+ # motion-latent-diffusion-standalone-demo (pyproject.toml)
110
+ # contourpy
111
+ # diffusers
112
+ # gradio
113
+ # matplotlib
114
+ # pandas
115
+ # transformers
116
+ nvidia-cublas-cu12==12.8.4.1
117
+ # via
118
+ # nvidia-cudnn-cu12
119
+ # nvidia-cusolver-cu12
120
+ # torch
121
+ nvidia-cuda-cupti-cu12==12.8.90
122
+ # via torch
123
+ nvidia-cuda-nvrtc-cu12==12.8.93
124
+ # via torch
125
+ nvidia-cuda-runtime-cu12==12.8.90
126
+ # via torch
127
+ nvidia-cudnn-cu12==9.10.2.21
128
+ # via torch
129
+ nvidia-cufft-cu12==11.3.3.83
130
+ # via torch
131
+ nvidia-cufile-cu12==1.13.1.3
132
+ # via torch
133
+ nvidia-curand-cu12==10.3.9.90
134
+ # via torch
135
+ nvidia-cusolver-cu12==11.7.3.90
136
+ # via torch
137
+ nvidia-cusparse-cu12==12.5.8.93
138
+ # via
139
+ # nvidia-cusolver-cu12
140
+ # torch
141
+ nvidia-cusparselt-cu12==0.7.1
142
+ # via torch
143
+ nvidia-nccl-cu12==2.27.5
144
+ # via torch
145
+ nvidia-nvjitlink-cu12==12.8.93
146
+ # via
147
+ # nvidia-cufft-cu12
148
+ # nvidia-cusolver-cu12
149
+ # nvidia-cusparse-cu12
150
+ # torch
151
+ nvidia-nvshmem-cu12==3.3.20
152
+ # via torch
153
+ nvidia-nvtx-cu12==12.8.90
154
+ # via torch
155
+ orjson==3.11.3
156
+ # via gradio
157
+ packaging==25.0
158
+ # via
159
+ # gradio
160
+ # gradio-client
161
+ # huggingface-hub
162
+ # matplotlib
163
+ # spaces
164
+ # transformers
165
+ pandas==2.3.3
166
+ # via gradio
167
+ pillow==11.3.0
168
+ # via
169
+ # diffusers
170
+ # gradio
171
+ # matplotlib
172
+ psutil==5.9.8
173
+ # via spaces
174
+ pydantic==2.11.10
175
+ # via
176
+ # fastapi
177
+ # gradio
178
+ # spaces
179
+ pydantic-core==2.33.2
180
+ # via pydantic
181
+ pydub==0.25.1
182
+ # via gradio
183
+ pygments==2.19.2
184
+ # via rich
185
+ pyparsing==3.2.5
186
+ # via matplotlib
187
+ python-dateutil==2.9.0.post0
188
+ # via
189
+ # matplotlib
190
+ # pandas
191
+ python-multipart==0.0.20
192
+ # via gradio
193
+ pytz==2025.2
194
+ # via pandas
195
+ pyyaml==6.0.3
196
+ # via
197
+ # gradio
198
+ # huggingface-hub
199
+ # transformers
200
+ regex==2025.9.18
201
+ # via
202
+ # diffusers
203
+ # transformers
204
+ requests==2.32.5
205
+ # via
206
+ # diffusers
207
+ # huggingface-hub
208
+ # spaces
209
+ # transformers
210
+ rich==14.2.0
211
+ # via typer
212
+ ruff==0.14.1
213
+ # via gradio
214
+ safehttpx==0.1.6
215
+ # via gradio
216
+ safetensors==0.6.2
217
+ # via
218
+ # diffusers
219
+ # transformers
220
+ semantic-version==2.10.0
221
+ # via gradio
222
+ shellingham==1.5.4
223
+ # via typer
224
+ six==1.17.0
225
+ # via python-dateutil
226
+ sniffio==1.3.1
227
+ # via anyio
228
+ spaces==0.42.1
229
+ # via motion-latent-diffusion-standalone-demo (pyproject.toml)
230
+ starlette==0.48.0
231
+ # via
232
+ # fastapi
233
+ # gradio
234
+ sympy==1.14.0
235
+ # via torch
236
+ tokenizers==0.22.1
237
+ # via transformers
238
+ tomlkit==0.13.3
239
+ # via gradio
240
+ torch==2.9.0
241
+ # via
242
+ # motion-latent-diffusion-standalone-demo (pyproject.toml)
243
+ # motion-latent-diffusion-standalone
244
+ tqdm==4.67.1
245
+ # via
246
+ # motion-latent-diffusion-standalone-demo (pyproject.toml)
247
+ # huggingface-hub
248
+ # transformers
249
+ transformers==4.57.1
250
+ # via motion-latent-diffusion-standalone
251
+ triton==3.5.0
252
+ # via torch
253
+ typer==0.19.2
254
+ # via gradio
255
+ typing-extensions==4.15.0
256
+ # via
257
+ # anyio
258
+ # fastapi
259
+ # gradio
260
+ # gradio-client
261
+ # huggingface-hub
262
+ # pydantic
263
+ # pydantic-core
264
+ # spaces
265
+ # starlette
266
+ # torch
267
+ # typer
268
+ # typing-inspection
269
+ typing-inspection==0.4.2
270
+ # via pydantic
271
+ tzdata==2025.2
272
+ # via pandas
273
+ urllib3==2.5.0
274
+ # via requests
275
+ uvicorn==0.38.0
276
+ # via gradio
277
+ websockets==15.0.1
278
+ # via gradio-client
279
+ zipp==3.23.0
280
+ # via importlib-metadata
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
visualize.py ADDED
@@ -0,0 +1,281 @@
1
+ """
2
+ Simple 3D skeleton motion visualizer for HumanML3D motion data.
3
+ Usage: python visualize.py <motion.pt> [--output output.mp4] [--fps 20]
4
+ """
5
+
6
+ import argparse
7
+ import numpy as np
8
+ import torch
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.animation import FuncAnimation, FFMpegWriter
11
+ from pathlib import Path
12
+
13
+
14
+ # HumanML3D skeleton structure (22 joints)
15
+ # Kinematic chain based on HumanML3D dataset specification
16
+ # From mld/utils/joints.py and datasets/HumanML3D/paramUtil.py
17
+ SKELETON_CHAINS = [
18
+ [0, 3, 6, 9, 12, 15], # Body: root -> BP -> BT -> BLN -> BMN -> BUN (head)
19
+ [9, 14, 17, 19, 21], # Left arm: BLN -> LSI -> LS -> LE -> LW
20
+ [9, 13, 16, 18, 20], # Right arm: BLN -> RSI -> RS -> RE -> RW
21
+ [0, 2, 5, 8, 11], # Left leg: root -> LH -> LK -> LMrot -> LF
22
+ [0, 1, 4, 7, 10], # Right leg: root -> RH -> RK -> RMrot -> RF
23
+ ]
24
+
25
+
26
+ def load_motion(pt_path: str) -> np.ndarray:
27
+ """
28
+ Load motion data from .pt file (PyTorch tensor).
29
+
30
+ HumanML3D format: (frames, 22, 3) where last dimension is (x, y, z)
31
+ In HumanML3D: Y is vertical (up), X and Z are horizontal
32
+ For proper 3D visualization: we'll map Y -> Z (vertical), X -> X, Z -> Y
33
+
34
+ Returns numpy array for matplotlib visualization.
35
+ """
36
+ # Load PyTorch tensor and convert to numpy for visualization
37
+ motion_tensor = torch.load(pt_path, map_location="cpu")
38
+ motion = motion_tensor.numpy()
39
+
40
+ print(f"Loaded motion: {motion.shape}")
41
+ print(f" Frames: {motion.shape[0]}")
42
+ print(f" Joints: {motion.shape[1]}")
43
+ print(f" Dimensions: {motion.shape[2]}")
44
+
45
+ # Remap axes: HumanML3D (x, y, z) -> Visualization (x, z, y)
46
+ # This makes Y axis (vertical in HumanML3D) become Z axis (vertical in plot)
47
+ motion_remapped = motion.copy()
48
+ motion_remapped[:, :, [0, 1, 2]] = motion[:, :, [0, 2, 1]] # x, z, y <- x, y, z
49
+
50
+ return motion_remapped
51
+
52
+
53
+ def setup_3d_plot():
54
+ """Set up the 3D plot with proper viewing angle."""
55
+ fig = plt.figure(figsize=(10, 10))
56
+ ax = fig.add_subplot(111, projection="3d")
57
+
58
+ # Set labels
59
+ ax.set_xlabel("X")
60
+ ax.set_ylabel("Y")
61
+ ax.set_zlabel("Z")
62
+
63
+ return fig, ax
64
+
65
+
66
+ def update_frame(frame_idx: int, motion: np.ndarray, ax, lines: list, points: list):
67
+ """Update function for animation."""
68
+ ax.clear()
69
+
70
+ # Get current frame
71
+ frame = motion[frame_idx]
72
+
73
+ # Set consistent axis limits based on all frames
74
+ all_coords = motion.reshape(-1, 3)
75
+ margin = 0.5
76
+ x_range = [all_coords[:, 0].min() - margin, all_coords[:, 0].max() + margin]
77
+ y_range = [all_coords[:, 1].min() - margin, all_coords[:, 1].max() + margin]
78
+ z_range = [0, all_coords[:, 2].max() + margin] # Z starts at ground (0)
79
+
80
+ ax.set_xlim(x_range)
81
+ ax.set_ylim(y_range)
82
+ ax.set_zlim(z_range)
83
+
84
+ # Set labels and title
85
+ ax.set_xlabel("X", fontsize=10)
86
+ ax.set_ylabel("Y", fontsize=10)
87
+ ax.set_zlabel("Z (Height)", fontsize=10)
88
+ ax.set_title(f"Frame {frame_idx + 1}/{len(motion)}", fontsize=14, pad=20)
89
+
90
+ # Set viewing angle (slightly elevated, rotated for better view)
91
+ ax.view_init(elev=15, azim=45)
92
+
93
+ # Draw ground plane at z=0
94
+ xx, yy = np.meshgrid(
95
+ np.linspace(x_range[0], x_range[1], 2), np.linspace(y_range[0], y_range[1], 2)
96
+ )
97
+ zz = np.zeros_like(xx)
98
+ ax.plot_surface(xx, yy, zz, alpha=0.1, color="gray")
99
+
100
+ # Plot skeleton bones with different colors for different parts
101
+ colors = ["red", "blue", "green", "cyan", "magenta"]
102
+ for chain_idx, chain in enumerate(SKELETON_CHAINS):
103
+ color = colors[chain_idx % len(colors)]
104
+ for i in range(len(chain) - 1):
105
+ j1, j2 = chain[i], chain[i + 1]
106
+ if j1 < len(frame) and j2 < len(frame):
107
+ xs = [frame[j1, 0], frame[j2, 0]]
108
+ ys = [frame[j1, 1], frame[j2, 1]]
109
+ zs = [frame[j1, 2], frame[j2, 2]]
110
+ linewidth = 4.0 if chain_idx == 0 else 3.0 # Thicker for body
111
+ ax.plot(xs, ys, zs, color=color, linewidth=linewidth, alpha=0.8)
112
+
113
+ # Plot joints (darker red)
114
+ ax.scatter(
115
+ frame[:, 0],
116
+ frame[:, 1],
117
+ frame[:, 2],
118
+ c="darkred",
119
+ marker="o",
120
+ s=50,
121
+ alpha=0.9,
122
+ edgecolors="black",
123
+ linewidth=0.5,
124
+ )
125
+
126
+ # Add grid
127
+ ax.grid(True, alpha=0.3)
128
+
129
+ return (ax,)
130
+
131
+
132
+ def create_video_from_joints(
133
+ joints: torch.Tensor | np.ndarray, output_path: str, fps: int = 20
134
+ ) -> str:
135
+ """
136
+ Create 3D skeleton animation directly from joint tensor or array.
137
+
138
+ Args:
139
+ joints: Joint positions as torch.Tensor or np.ndarray (frames, 22, 3)
140
+ output_path: Path to save video
141
+ fps: Frames per second for the video
142
+
143
+ Returns:
144
+ Path to output video
145
+ """
146
+ # Convert to numpy if it's a torch tensor
147
+ if isinstance(joints, torch.Tensor):
148
+ joints = joints.cpu().numpy()
149
+
150
+ # Remap axes for visualization (same as load_motion)
151
+ motion = joints.copy()
152
+ motion[:, :, [0, 1, 2]] = joints[:, :, [0, 2, 1]] # x, z, y <- x, y, z
153
+
154
+ # Set up plot
155
+ fig, ax = setup_3d_plot()
156
+ lines, points = [], []
157
+
158
+ # Create animation
159
+ anim = FuncAnimation(
160
+ fig,
161
+ update_frame,
162
+ frames=len(motion),
163
+ fargs=(motion, ax, lines, points),
164
+ interval=1000 / fps,
165
+ blit=False,
166
+ repeat=True,
167
+ )
168
+
169
+ # Save video using FFMpeg
170
+ writer = FFMpegWriter(fps=fps, bitrate=1800, codec="libx264")
171
+ anim.save(str(output_path), writer=writer, dpi=100)
172
+
173
+ plt.close(fig)
174
+ return str(output_path)
175
+
176
+
177
+ def visualize_motion(
178
+ pt_path: str, output_path: str | None = None, fps: int = 20, show: bool = False
179
+ ) -> str:
180
+ """
181
+ Visualize motion from .pt file (PyTorch tensor).
182
+
183
+ Args:
184
+ pt_path: Path to .pt motion file
185
+ output_path: Path to save video (if None, will auto-generate)
186
+ fps: Frames per second for the video
187
+ show: If True, display the animation in a window
188
+
189
+ Returns:
190
+ Path to the generated video file
191
+ """
192
+ # Load motion data (converts to numpy internally for matplotlib)
193
+ motion = load_motion(pt_path)
194
+
195
+ # Create output path if not specified
196
+ if output_path is None:
197
+ output_path = Path(pt_path).with_suffix(".mp4")
198
+ else:
199
+ output_path = Path(output_path)
200
+
201
+ print(f"\nCreating animation with {fps} FPS...")
202
+
203
+ # Set up plot
204
+ fig, ax = setup_3d_plot()
205
+ lines, points = [], []
206
+
207
+ # Create animation
208
+ anim = FuncAnimation(
209
+ fig,
210
+ update_frame,
211
+ frames=len(motion),
212
+ fargs=(motion, ax, lines, points),
213
+ interval=1000 / fps,
214
+ blit=False,
215
+ repeat=True,
216
+ )
217
+
218
+ # Save video using FFMpeg
219
+ print(f"Saving video to: {output_path}")
220
+ writer = FFMpegWriter(fps=fps, bitrate=1800, codec="libx264")
221
+ anim.save(str(output_path), writer=writer, dpi=100)
222
+ print("✓ Video saved successfully!")
223
+
224
+ # Show animation if requested
225
+ if show:
226
+ plt.show()
227
+
228
+ plt.close(fig)
229
+ return str(output_path)
230
+
231
+
232
+ def main() -> int:
233
+ """Main entry point for CLI"""
234
+ parser = argparse.ArgumentParser(
235
+ description="Visualize HumanML3D motion data as 3D skeleton animation"
236
+ )
237
+ parser.add_argument("input", type=str, help="Path to input .pt motion file")
238
+ parser.add_argument(
239
+ "--output",
240
+ "-o",
241
+ type=str,
242
+ default=None,
243
+ help="Path to output video file (default: input_name.mp4)",
244
+ )
245
+ parser.add_argument(
246
+ "--fps",
247
+ type=int,
248
+ default=20,
249
+ help="Frames per second for output video (default: 20)",
250
+ )
251
+ parser.add_argument(
252
+ "--show",
253
+ action="store_true",
254
+ help="Display the animation in a window (in addition to saving)",
255
+ )
256
+
257
+ args = parser.parse_args()
258
+
259
+ # Check if input file exists
260
+ input_path = Path(args.input)
261
+ if not input_path.exists():
262
+ print(f"Error: Input file not found: {args.input}")
263
+ return 1
264
+
265
+ # Visualize the motion
266
+ try:
267
+ output_path = visualize_motion(
268
+ args.input, output_path=args.output, fps=args.fps, show=args.show
269
+ )
270
+ print(f"\n✓ Done! Video saved to: {output_path}")
271
+ return 0
272
+ except Exception as e:
273
+ print(f"\n✗ Error: {e}")
274
+ import traceback
275
+
276
+ traceback.print_exc()
277
+ return 1
278
+
279
+
280
+ if __name__ == "__main__":
281
+ exit(main())
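As a usage note (a sketch, not part of this commit): besides the CLI entry point above, `create_video_from_joints` can be called directly when a joints tensor is already in memory. The path below is a placeholder; any tensor of shape `(frames, 22, 3)` in HumanML3D joint order should work.

```python
# Hedged example: render an in-memory joints tensor without going through the CLI.
# "my_motion.pt" is a placeholder; replace it with a real (frames, 22, 3) tensor file.
import torch

from visualize import create_video_from_joints

joints = torch.load("my_motion.pt", map_location="cpu")  # expected shape: (frames, 22, 3)
video_path = create_video_from_joints(joints, "my_motion.mp4", fps=20)
print(f"Saved skeleton animation to {video_path}")
```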