Add microbatch_group_size_per_vp_stage as configurable.

Files changed:
- README.md (+2 -0)
- conf/config.yaml (+1 -0)
- main.py (+1 -0)
- src/execution_model.py (+5 -0)
- src/strategies.py (+1 -1)
README.md

````diff
@@ -72,6 +72,8 @@ uv run python main.py strategy=interleave num_devices=4 num_stages=8 num_batches
 ```
 
 
+You can optionally set `microbatch_group_size_per_vp_stage`.
+
 ### Running for ZB-1P strategy:
 ```bash
 uv run python main.py strategy=zb1p num_devices=4 num_stages=4 num_batches=8
````
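Since `main.py` receives a Hydra/OmegaConf `DictConfig`, the new key can presumably be overridden on the command line in the same key=value style as the existing flags, e.g. `uv run python main.py strategy=interleave num_devices=4 num_stages=8 num_batches=8 microbatch_group_size_per_vp_stage=2` (the value `2` is purely illustrative).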
conf/config.yaml

```diff
@@ -5,6 +5,7 @@ num_batches: 8
 visualization_port: 8050
 strategy: "1f1b" # Options: "1f1b", "interleave"
 p2p_latency: 0.0
+microbatch_group_size_per_vp_stage: null
 
 # Operation time configurations
 op_times:
```
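The key defaults to `null`, which OmegaConf surfaces to Python as `None`; as the `src/execution_model.py` diff below shows, `ScheduleConfig` then falls back to `num_devices`.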
main.py

```diff
@@ -71,6 +71,7 @@ def run_interleave(cfg: DictConfig) -> None:
         p2p_latency=cfg.p2p_latency,
         placement_strategy="interleave",
         op_times=op_times,
+        microbatch_group_size_per_vp_stage=cfg.microbatch_group_size_per_vp_stage,
     )
     schedule = generate_1f1b_interleave_schedule(schedule_config)
     schedule.execute()
```
src/execution_model.py

```diff
@@ -83,6 +83,7 @@ class ScheduleConfig:
         placement_strategy: str = "standard",
         split_backward: bool = False,
         op_times: Optional[Dict[str, Union[float, Dict[int, float]]]] = None,
+        microbatch_group_size_per_vp_stage: Optional[int] = None,
     ):
         self.num_devices = num_devices
         self.num_stages = num_stages
@@ -90,6 +91,10 @@ class ScheduleConfig:
         self.p2p_latency = p2p_latency
         self.placement_strategy = placement_strategy
         self.split_backward = split_backward
+        if microbatch_group_size_per_vp_stage is None:
+            self.microbatch_group_size_per_vp_stage = num_devices
+        else:
+            self.microbatch_group_size_per_vp_stage = microbatch_group_size_per_vp_stage
 
         # Initialize default operation times
         if self.split_backward:
```
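To make the fallback concrete, here is a minimal sketch of just the defaulting logic, lifted out of `ScheduleConfig`; the helper name `resolve_microbatch_group_size` is invented for illustration, and the real constructor takes many more arguments:

```python
from typing import Optional

def resolve_microbatch_group_size(
    num_devices: int,
    microbatch_group_size_per_vp_stage: Optional[int] = None,
) -> int:
    # `null` in conf/config.yaml arrives here as None and falls back to
    # num_devices; an explicit integer is used unchanged.
    if microbatch_group_size_per_vp_stage is None:
        return num_devices
    return microbatch_group_size_per_vp_stage

assert resolve_microbatch_group_size(num_devices=4) == 4
assert resolve_microbatch_group_size(num_devices=4, microbatch_group_size_per_vp_stage=2) == 2
```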
src/strategies.py

```diff
@@ -244,7 +244,7 @@ def generate_1f1b_interleave_schedule(config: ScheduleConfig):
     schedule = Schedule(config)
 
     for device_id in range(config.num_devices):
-        microbatch_group_size_per_vp_stage = config.
+        microbatch_group_size_per_vp_stage = config.microbatch_group_size_per_vp_stage
         num_warmup_microbatches = _get_pp_rank_microbatches(
             config.num_batches,
             config.num_devices,
```
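Putting the pieces together, an end-to-end use mirroring `run_interleave` in `main.py` might look like the sketch below; the import paths and any `ScheduleConfig` arguments not visible in this diff are assumptions, and `2` is an arbitrary value:

```python
# Hypothetical usage sketch; assumes these import paths and that
# ScheduleConfig accepts the arguments shown.
from src.execution_model import ScheduleConfig
from src.strategies import generate_1f1b_interleave_schedule

config = ScheduleConfig(
    num_devices=4,
    num_stages=8,
    num_batches=8,
    placement_strategy="interleave",
    microbatch_group_size_per_vp_stage=2,  # leave as None to fall back to num_devices
)
schedule = generate_1f1b_interleave_schedule(config)
schedule.execute()
```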