Add support for DualPipe.

Files changed:
- .gitignore +1 -0
- README.md +21 -6
- assets/dualpipe.png +3 -0
- conf/config.yaml +3 -0
- main.py +23 -0
- src/execution_model.py +81 -19
- src/strategies.py +227 -2
- src/visualizer.py +2 -12
    	
.gitignore CHANGED

@@ -3,6 +3,7 @@
 uv.lock
 outputs/
 .cursor/*
+*.json
 
 # Uncomment below if you want to include these files
 # !assets/*.png
    	
README.md CHANGED

@@ -18,6 +18,7 @@ Pipeline parallelism is a technique used to train large models by partitioning t
   - Zero-Bubble 1F1B (ZB-1P)
   - 1F1B with computation-communication overlap
   - Interleaved 1F1B with computation-communication overlap
+  - DualPipe (bidirectional pipeline parallelism with full forward-backward overlap)
 
 - **Visualization**:
   - Interactive visualization dashboard using Plotly/Dash
@@ -56,6 +57,12 @@ uv run python main.py strategy=zb1p num_devices=4 num_stages=4 num_batches=8
 ```
 
+### Running for DualPipe strategy:
+```bash
+uv run python main.py strategy=dualpipe num_devices=8 num_stages=8 num_batches=20
+```
+
 ### Running for 1F1B-batch-overlap strategy:
 ```bash
 uv run python main.py strategy=1f1b_overlap num_devices=4 num_stages=4 num_batches=8
@@ -68,10 +75,24 @@ uv run python main.py strategy=1f1b_interleave_overlap num_devices=4 num_stages=
 ```
 
 ## Configuration
 
 The default configuration is in `conf/config.yaml`. You can override any parameter on the command line or create configuration groups for different scenarios.
 
+#### Override Specific Parameters
+
+You can override specific parameters at runtime:
+```bash
+uv run python main.py op_times.forward=0.5 op_times.backward=1.0 num_batches=6
+```
+
+Using DualPipe as an example, you can manually set different times for forward/backward/backward_D/backward_W/overlapped_forward_backward:
+```bash
+uv run python main.py strategy=dualpipe num_devices=8 num_stages=8 num_batches=32 op_times.forward=1.0 op_times.backward=2.0 op_times.backward_D=1.0 op_times.backward_W=1.0 op_times.overlapped_forward_backward=2.5
+```
+
 ### Using Different Configuration Files
 
 You can use different configuration files with Hydra in several ways:
@@ -90,12 +111,6 @@ You can use different configuration files with Hydra in several ways:
    uv run python main.py --config-name=model_A
    ```
 
-#### Override Specific Parameters
-
-You can also override specific parameters at runtime:
-```bash
-uv run python main.py op_times.forward=0.5 op_times.backward=1.0 num_batches=6
-```
 
 ## Project Structure
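A quick arithmetic aside (not from the repository): in the DualPipe override example above, the fused block's 2.5 time units replace a sequential forward-plus-backward pair of 1.0 + 2.0, which is where the overlap saves time. A one-line check in Python:

```python
# Numbers taken from the README example above; the 0.5 difference is the
# per-pair saving from running forward and backward as one fused block.
forward, backward, overlapped = 1.0, 2.0, 2.5
print(forward + backward - overlapped)  # 0.5
```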
    	
assets/dualpipe.png ADDED (binary image, stored via Git LFS)
    	
conf/config.yaml CHANGED

@@ -11,6 +11,9 @@ op_times:
   # Option 1: Simple configuration (same time for all stages)
   forward: 1.0
   backward: 2.0
+  backward_D: 1.0
+  backward_W: 1.0
+  overlapped_forward_backward: 2.0
 
   # Option 2: Commented example of stage-specific configuration
   # forward:
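For orientation, a minimal sketch (not part of the commit) of the mapping this YAML resolves to under the simple, stage-independent configuration; the values are the abstract time units the simulator assigns to each operation type.

```python
# Sketch of the resolved op_times mapping (simple, stage-independent config).
# Keys mirror conf/config.yaml after this change.
op_times = {
    "forward": 1.0,
    "backward": 2.0,                     # full (non-split) backward
    "backward_D": 1.0,                   # input-gradient half of a split backward
    "backward_W": 1.0,                   # weight-gradient half of a split backward
    "overlapped_forward_backward": 2.0,  # fused forward+backward block used by DualPipe
}
print(op_times["overlapped_forward_backward"])  # 2.0
```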
    	
main.py CHANGED

@@ -5,6 +5,7 @@ from src.strategies import (
     generate_1f1b_overlap_schedule,
     generate_1f1b_schedule,
     generate_zero_bubble_1p_schedule,
+    generate_dualpipe_schedule,
 )
 from src.visualizer import visualize_pipeline_parallelism_dash
 import hydra
@@ -26,6 +27,8 @@ def main(cfg: DictConfig) -> None:
         run_1f1b_overlap(cfg)
     elif cfg.strategy == "1f1b_interleave_overlap":
         run_1f1b_interleave_overlap(cfg)
+    elif cfg.strategy == "dualpipe":
+        run_dualpipe(cfg)
     else:
         raise ValueError(f"Unknown strategy: {cfg.strategy}")
 
@@ -129,5 +132,25 @@ def run_1f1b_interleave_overlap(cfg: DictConfig) -> None:
     schedule.execute()
     visualize_pipeline_parallelism_dash(schedule, port=cfg.visualization_port)
 
+def run_dualpipe(cfg: DictConfig) -> None:
+    """Run DualPipe pipeline parallelism simulation."""
+    # Convert OmegaConf to dict for op_times if it exists
+    op_times = (
+        OmegaConf.to_container(cfg.op_times) if hasattr(cfg, "op_times") else None
+    )
+
+    schedule_config = ScheduleConfig(
+        num_devices=cfg.num_devices,
+        num_stages=cfg.num_stages,
+        num_batches=cfg.num_batches,
+        p2p_latency=cfg.p2p_latency,
+        op_times=op_times,
+        split_backward=True,
+        placement_strategy="dualpipe",
+    )
+    schedule = generate_dualpipe_schedule(schedule_config)
+    schedule.execute()
+    visualize_pipeline_parallelism_dash(schedule, port=cfg.visualization_port)
+
 if __name__ == "__main__":
     main()
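As a usage sketch (not part of the commit), the same DualPipe run can be driven programmatically without Hydra; the constructor arguments mirror run_dualpipe above, the p2p_latency value is an assumed placeholder, and the Dash visualization step is omitted.

```python
# Hypothetical programmatic equivalent of `main.py strategy=dualpipe`,
# mirroring run_dualpipe above but without Hydra or the Dash visualizer.
from src.execution_model import ScheduleConfig
from src.strategies import generate_dualpipe_schedule

config = ScheduleConfig(
    num_devices=8,
    num_stages=8,
    num_batches=20,
    p2p_latency=0.0,             # assumed value; the project default lives in conf/config.yaml
    op_times=None,               # fall back to the built-in split-backward timings
    split_backward=True,         # DualPipe schedules backward_D / backward_W separately
    placement_strategy="dualpipe",
)
schedule = generate_dualpipe_schedule(config)
schedule.execute()               # resolve dependencies and assign start/end times
```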
    	
src/execution_model.py CHANGED

@@ -69,7 +69,7 @@ class DeviceQueue:
     def add_operation(self, op: Operation):
         assert op.stage_id in self.stages
         self.ops.append(op)
-        assert op.device_id is None
+        assert op.device_id is None, f"Operation {op.batch_id}, {op.stage_id}, {op.op_type} already has a device id on {op.device_id}"
         op.device_id = self.device_id
 
 
@@ -97,6 +97,7 @@ class ScheduleConfig:
                 "forward": 1.0,
                 "backward_D": 1.0,
                 "backward_W": 1.0,
+                "backward": 2.0,
             }
         else:
             self.op_times = {
@@ -128,9 +129,14 @@
         self.num_stages_per_device = num_stages // num_devices
 
         self.init_device_to_stages()
-
-
-
+        if self.placement_strategy == "dualpipe":
+            assert (
+                sum(len(stages) for stages in self.device_to_stages.values()) == num_stages * 2
+            )
+        else:
+            assert (
+                sum(len(stages) for stages in self.device_to_stages.values()) == num_stages
+            )
 
     def init_device_to_stages(self):
         if self.placement_strategy == "standard":
@@ -145,14 +151,27 @@
             for i in range(self.num_stages):
                 device_to_put = i % self.num_devices
                 self.device_to_stages[device_to_put].append(i)
+        elif self.placement_strategy == "dualpipe":
+            # For DualPipe, each device has two stages
+            assert self.num_devices == self.num_stages, "DualPipe requires num_devices == num_stages"
+            assert self.num_devices % 2 == 0, "DualPipe requires an even number of devices"
+            self.device_to_stages = defaultdict(list)
+            for i in range(self.num_stages):
+                self.device_to_stages[i] = [i, self.num_stages - i - 1]
         else:
             raise ValueError(f"Invalid placement strategy: {self.placement_strategy}")
 
     def get_op_time(self, op_type: str, stage_id: int):
         # For overlapped operations, extract the original operation types
         if op_type.startswith("overlapped_"):
-            if op_type in self.op_times
-
+            if op_type in self.op_times:
+                if isinstance(self.op_times[op_type], dict):
+                    if stage_id in self.op_times[op_type]:
+                        return self.op_times[op_type][stage_id]
+                    else:
+                        raise ValueError(f"No time specified for operation {op_type} at stage {stage_id}")
+                else:
+                    return self.op_times[op_type]
             else:
                 op_parts = op_type.split("_")[1:]
                 if len(op_parts) >= 2:
@@ -173,20 +192,25 @@
 
 
 class Schedule:
-    def __init__(self, config: ScheduleConfig):
+    def __init__(self, config: ScheduleConfig, init_ops: bool = True):
         self.ops = {}  # (batch_id, stage_id, op_type) -> Operation
         self.device_queues: List[DeviceQueue] = []
         for dev_id in range(config.num_devices):
             self.device_queues.append(DeviceQueue(config.device_to_stages[dev_id], dev_id))
         self.config = config
 
-        self.init_operations()
+        if init_ops:
+            self.init_operations()
         self.op_to_overlapped = {}
 
     def register_overlapped_operation(self, overlapped_op: OverlappedOperation):
         for op in overlapped_op.operations:
             self.op_to_overlapped[(op.batch_id, op.stage_id, op.op_type)] = overlapped_op
             self.ops[(op.batch_id, op.stage_id, op.op_type)] = overlapped_op
+
+    def register_operation(self, op: Operation):
+        assert (op.batch_id, op.stage_id, op.op_type) not in self.ops, f"Operation {op.batch_id}, {op.stage_id}, {op.op_type} already registered"
+        self.ops[(op.batch_id, op.stage_id, op.op_type)] = op
 
     def init_operations(self):
         op_types = ["forward", "backward"]
@@ -199,9 +223,12 @@
                     batch_id, stage_id, op_type
                 )
 
-    def get_op(self, batch_id: int, stage_id: int, op_type: str):
+    def get_op(self, batch_id: int, stage_id: int, op_type: str, allow_none=False):
         if (batch_id, stage_id, op_type) in self.op_to_overlapped:
             return self.op_to_overlapped[(batch_id, stage_id, op_type)]
+        if allow_none:
+            if (batch_id, stage_id, op_type) not in self.ops:
+                return None
         return self.ops[(batch_id, stage_id, op_type)]
 
     def get_dependencies(self, op: Operation, include_device_dependency=True):
@@ -226,20 +253,55 @@
         if self.config.split_backward:
             if op.op_type == "backward_D":
                 if op.stage_id < self.config.num_stages - 1:
-
-
-
-
-
+                    op_bwd_d = self.get_op(op.batch_id, op.stage_id + 1, "backward_D", allow_none=True)
+                    if op_bwd_d is not None:
+                        deps.append(
+                            (
+                                op_bwd_d,
+                                self.config.p2p_latency,
+                            )
+                        )
+                    else:
+                        deps.append(
+                            (
+                                self.get_op(op.batch_id, op.stage_id + 1, "backward"),
+                                self.config.p2p_latency,
+                            )
+                        )
             elif op.op_type == "backward_W":
                 if op.stage_id < self.config.num_stages - 1:
-
-
-
-
-
+                    op_bwd_d = self.get_op(op.batch_id, op.stage_id, "backward_D", allow_none=True)
+                    if op_bwd_d is not None:
+                        deps.append(
+                            (
+                                op_bwd_d,
+                                self.config.p2p_latency,
+                            )
+                        )
+                    else:
+                        deps.append(
+                            (
+                                self.get_op(op.batch_id, op.stage_id, "backward"),
+                                self.config.p2p_latency,
+                            )
+                        )
+            elif op.op_type == "backward":
+                if op.stage_id < self.config.num_stages - 1:
+                    op_bwd = self.get_op(op.batch_id, op.stage_id + 1, "backward", allow_none=True)
+                    if op_bwd is not None:
+                        deps.append(
+                            (
+                                op_bwd,
+                                self.config.p2p_latency,
+                            )
+                        )
+                    else:
+                        deps.append(
+                            (
+                                self.get_op(op.batch_id, op.stage_id + 1, "backward_D"),
+                                self.config.p2p_latency,
+                            )
+                        )
         else:
             if op.op_type == "backward":
                 if op.stage_id < self.config.num_stages - 1:
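To make the bidirectional placement concrete, here is a small illustrative sketch (not part of the commit) of the device-to-stage mapping the dualpipe branch above produces for 8 stages on 8 devices: each device owns one stage from each direction of the pipeline, so the total number of stage assignments is num_stages * 2, matching the assertion added above.

```python
# Illustration of the "dualpipe" placement: device i hosts stage i (forward
# direction) and stage num_stages - 1 - i (reverse direction).
num_stages = 8
device_to_stages = {i: [i, num_stages - 1 - i] for i in range(num_stages)}
print(device_to_stages)
# {0: [0, 7], 1: [1, 6], 2: [2, 5], 3: [3, 4],
#  4: [4, 3], 5: [5, 2], 6: [6, 1], 7: [7, 0]}
assert sum(len(s) for s in device_to_stages.values()) == num_stages * 2
```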
    	
src/strategies.py CHANGED

@@ -1,5 +1,5 @@
-from collections import defaultdict
-from src.execution_model import OverlappedOperation, Schedule, ScheduleConfig
+from collections import defaultdict, deque
+from src.execution_model import OverlappedOperation, Operation, Schedule, ScheduleConfig
 
 
 def generate_1f1b_schedule(config: ScheduleConfig):
@@ -43,6 +43,7 @@ def generate_zero_bubble_1p_schedule(config: ScheduleConfig):
     schedule = Schedule(config)
     total_batches = config.num_batches
     assert config.num_devices == config.num_stages, "num_devices must be equal to num_stages for ZB-1P"
+    assert config.split_backward, "ZB-1P requires split_backward=True"
 
     for i in range(config.num_devices):
         fwd_batch_id = 0
@@ -354,3 +355,227 @@ def generate_1f1b_interleave_overlap_schedule(config: ScheduleConfig):
 
 
     return schedule
+
+
+def create_overlapped_ops(schedule, batch_id1, batch_id2, stage_id, type1, type2):
+    """
+    Helper function to create overlapped operations correctly.
+    This handles the underlying operation creation and registration to avoid device_id issues.
+    """
+    # Get the operations from the schedule
+    op1 = schedule.ops[(batch_id1, stage_id, type1)]
+    op2 = schedule.ops[(batch_id2, stage_id, type2)]
+
+    # Create the overlapped operation
+    overlapped_op = OverlappedOperation([op1, op2])
+
+    # Register in the schedule to ensure proper tracking
+    schedule.register_overlapped_operation(overlapped_op)
+
+    return overlapped_op
+
+
+def generate_dualpipe_schedule(config: ScheduleConfig):
+    """
+    Implements the DualPipe scheduling strategy.
+
+    DualPipe is a bidirectional pipeline parallelism algorithm that achieves full overlap of forward
+    and backward computation-communication phases and reduces pipeline bubbles.
+
+    The DualPipe strategy has the following characteristics:
+    1. Requires placement_strategy="dualpipe" in ScheduleConfig (set automatically)
+    2. Each device handles both a forward stage and a reverse stage
+    3. Overlaps forward and backward operations to reduce bubble size
+    4. Assumes config.num_batches corresponds to half the total microbatches in the original paper (M).
+    5. Currently only supports split_backward=True.
+
+    Args:
+        config: The scheduling configuration
+
+    Returns:
+        A Schedule object with the DualPipe scheduling
+    """
+    # Ensure placement strategy is set for Schedule initialization
+    assert config.placement_strategy == "dualpipe", "DualPipe schedule currently only supports placement_strategy='dualpipe'"
+    # Assertions based on DualPipe requirements
+    assert config.num_stages % 2 == 0, "DualPipe requires an even number of stages (and devices)"
+    assert config.num_devices == config.num_stages, "DualPipe requires num_devices == num_stages"
+    assert config.num_batches % 2 == 0, "DualPipe requires an even number of microbatches (config.num_batches)"
+    # Assertion based on original implementation: num_chunks >= num_ranks * 2
+    # Here, M (config.num_batches) corresponds to half_num_chunks
+    assert config.num_batches >= config.num_devices, "DualPipe requires config.num_batches >= config.num_devices"
+    assert config.split_backward, "DualPipe schedule currently only supports split_backward=True"
+
+    schedule = Schedule(config, init_ops=False)
+
+    num_stages = config.num_stages
+    num_devices = config.num_devices
+    # config.num_batches is M in the original paper, which corresponds to half_num_chunks
+    half_num_chunks = config.num_batches // 2
+    num_half_ranks = num_devices // 2
+
+    fwd_batch_ids = defaultdict(int)  # (device_id, phase) -> batch_id
+    bwd_d_batch_ids = defaultdict(int)  # (device_id, phase) -> batch_id
+
+    waited_weight_grad = [deque() for _ in range(num_devices)]  # device_id -> deque of (stage_id, batch_id)
+
+    for device_id in range(num_devices):
+        is_in_second_half = device_id >= num_half_ranks
+        if is_in_second_half:
+            fwd_batch_ids[device_id, 1] = 0
+            fwd_batch_ids[device_id, 0] = config.num_batches // 2
+            bwd_d_batch_ids[device_id, 1] = 0
+            bwd_d_batch_ids[device_id, 0] = config.num_batches // 2
+        else:
+            fwd_batch_ids[device_id, 0] = 0
+            fwd_batch_ids[device_id, 1] = config.num_batches // 2
+            bwd_d_batch_ids[device_id, 0] = 0
+            bwd_d_batch_ids[device_id, 1] = config.num_batches // 2
+
+    def get_stage_for_phase(device_id, phase, num_stages, is_in_second_half):
+        stage_fwd_dir = device_id  # Stage handled when moving forward (0 to N-1)
+        stage_rev_dir = num_stages - 1 - device_id  # Stage handled when moving backward (N-1 to 0)
+        if not is_in_second_half:
+            # First half: phase 0 -> fwd_dir, phase 1 -> rev_dir
+            return stage_fwd_dir if phase == 0 else stage_rev_dir
+        else:
+            # Second half: phase 0 -> rev_dir, phase 1 -> fwd_dir
+            return stage_rev_dir if phase == 0 else stage_fwd_dir
+
+    def add_op_to_queue(device_id, stage_id, op_type, batch_id):
+        # Create the Operation, register it with the schedule, then enqueue it on the device
+        op = Operation(batch_id, stage_id, op_type)
+        schedule.register_operation(op)
+        # Add to the device queue
+        schedule.device_queues[device_id].add_operation(op)
+
+    def _schedule_forward_chunk(device_id, phase, is_in_second_half):
+        """Schedules a forward compute operation."""
+        stage_id = get_stage_for_phase(device_id, phase, num_stages, is_in_second_half)
+        batch_id = fwd_batch_ids[device_id, phase]
+        add_op_to_queue(device_id, stage_id, "forward", batch_id)
+        fwd_batch_ids[device_id, phase] += 1
+
+    def _schedule_backward_chunk(device_id, phase, is_in_second_half):
+        """Schedules a full backward (backward_D plus backward_W) compute operation."""
+        stage_id = get_stage_for_phase(device_id, phase, num_stages, is_in_second_half)
+        batch_id = bwd_d_batch_ids[device_id, phase]
+        add_op_to_queue(device_id, stage_id, "backward", batch_id)
+        bwd_d_batch_ids[device_id, phase] += 1
+
+    def _schedule_backward_input_chunk(device_id, phase, is_in_second_half):
+        """Schedules a backward_D compute operation."""
+        stage_id = get_stage_for_phase(device_id, phase, num_stages, is_in_second_half)
+        batch_id = bwd_d_batch_ids[device_id, phase]
+        add_op_to_queue(device_id, stage_id, "backward_D", batch_id)
+        bwd_d_batch_ids[device_id, phase] += 1
+        waited_weight_grad[device_id].append((stage_id, batch_id))
+
+    def _schedule_backward_weight_chunk(device_id):
+        """Schedules a backward_W compute operation."""
+        stage_id, batch_id = waited_weight_grad[device_id].popleft()
+        add_op_to_queue(device_id, stage_id, "backward_W", batch_id)
+
+    def _schedule_forward_backward_chunk(device_id, fwd_phase, bwd_phase, is_in_second_half):
+        """Schedules an overlapped forward and backward compute operation."""
+        fwd_stage_id = get_stage_for_phase(device_id, fwd_phase, num_stages, is_in_second_half)
+        bwd_stage_id = get_stage_for_phase(device_id, bwd_phase, num_stages, is_in_second_half)
+
+        fwd_batch_id = fwd_batch_ids[device_id, fwd_phase]
+
+        fwd_op = Operation(fwd_batch_id, fwd_stage_id, "forward")
+        schedule.register_operation(fwd_op)
+        fwd_batch_ids[device_id, fwd_phase] += 1
+
+        bwd_batch_id_d = bwd_d_batch_ids[device_id, bwd_phase]
+        bwd_op = Operation(bwd_batch_id_d, bwd_stage_id, "backward")
+        schedule.register_operation(bwd_op)
+        bwd_d_batch_ids[device_id, bwd_phase] += 1
+
+        # Create and register the overlapped operation
+        overlapped_op = OverlappedOperation([fwd_op, bwd_op])
+        schedule.register_overlapped_operation(overlapped_op)
+
+        # Add the overlapped operation to the queue
+        schedule.device_queues[device_id].add_operation(overlapped_op)
+
+    # Process each device (rank in original code)
+    for device_id in range(num_devices):
+        half_rank = min(device_id, num_devices - 1 - device_id)
+        is_in_second_half = device_id >= num_half_ranks
+        is_middle_rank = (device_id == num_half_ranks - 1) or (device_id == num_half_ranks)
+
+        # Map original steps to operation additions
+        # Step 1: nF0
+        step_1_count = (num_half_ranks - half_rank - 1) * 2
+        for _ in range(step_1_count):
+            _schedule_forward_chunk(device_id, 0, is_in_second_half)  # F0
+
+        # Step 2: nF0F1
+        step_2_count = half_rank + 1
+        for i in range(step_2_count):
+            _schedule_forward_chunk(device_id, 0, is_in_second_half)  # F0
+            _schedule_forward_chunk(device_id, 1, is_in_second_half)  # F1
+
+        # Step 3: nB1W1F1
+        step_3_count = num_half_ranks - half_rank - 1
+        for _ in range(step_3_count):
+            _schedule_backward_input_chunk(device_id, 1, is_in_second_half)  # B1_D
+            _schedule_backward_weight_chunk(device_id)  # W1
+            _schedule_forward_chunk(device_id, 1, is_in_second_half)  # F1
+
+        # Step 4 (Main step): nF0B1F1B0
+        step_4_count = half_num_chunks - num_devices + half_rank + 1
+        for i in range(step_4_count):
+            # if i == 0 and is_middle_rank:
+                # Schedule F0, B1_D, W1 sequentially for middle ranks on first iteration
+                # _schedule_forward_chunk(device_id, 0, is_in_second_half) # F0
+                # _schedule_backward_chunk(device_id, 1, is_in_second_half)# B1
+                # _schedule_backward_weight_chunk(device_id, 1, is_in_second_half)  # W1
+            # else:
+            # Overlap F0 and B1
+            _schedule_forward_backward_chunk(device_id, 0, 1, is_in_second_half)  # F0+B1
+
+            # Overlap F1 and B0
+            _schedule_forward_backward_chunk(device_id, 1, 0, is_in_second_half)  # F1+B0
+
+        # Step 5: nB1F1B0
+        step_5_count = num_half_ranks - half_rank - 1
+        for _ in range(step_5_count):
+            _schedule_backward_chunk(device_id, 1, is_in_second_half)  # B1_D + B1_W
+            _schedule_forward_backward_chunk(device_id, 1, 0, is_in_second_half)  # F1+B0
+
+        # Step 6: nB1B0
+        step_6_count = half_rank + 1
+        enable_zb = False
+        for i in range(step_6_count):
+            if i == step_6_count // 2 and half_rank % 2 == 1:
+                enable_zb = True
+            if enable_zb:
+                _schedule_backward_input_chunk(device_id, 1, is_in_second_half)
+            else:
+                _schedule_backward_chunk(device_id, 1, is_in_second_half)
+            if i == step_6_count // 2 and half_rank % 2 == 0:
+                enable_zb = True
+            if enable_zb:
+                _schedule_backward_input_chunk(device_id, 0, is_in_second_half)
+            else:
+                _schedule_backward_chunk(device_id, 0, is_in_second_half)
+
+        # Step 7: nWB0
+        step_7_count = num_half_ranks - half_rank - 1
+        for _ in range(step_7_count):
+            _schedule_backward_weight_chunk(device_id)  # W1 (use gradient from B1_D scheduled previously)
+            _schedule_backward_input_chunk(device_id, 0, is_in_second_half)  # B0_D
+
+        # Step 8: nW
+        step_8_count = half_rank + 1
+        for _ in range(step_8_count):
+            # W0 uses gradients from B0_D scheduled in steps 4, 5, 6.
+            # W1 uses gradients from B1_D scheduled in steps 3, 4, 5, 6.
+            # The last W0 gradients correspond to B0_D from step 6 or 7.
+            _schedule_backward_weight_chunk(device_id)  # W0 (use gradient from B0_D scheduled previously)
+
+    return schedule
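As a rough sanity check on the schedule shape (not part of the commit), the per-rank counts for steps 1-8 can be reproduced with the same formulas as above; the numbers below assume 8 devices and 20 microbatches, the README's DualPipe example.

```python
# Per-rank step counts for the DualPipe phases, computed with the same
# formulas as generate_dualpipe_schedule. Assumes num_devices = 8 and
# num_batches = 20 (so half_num_chunks = 10), as in the README example.
num_devices = 8
half_num_chunks = 20 // 2
num_half_ranks = num_devices // 2

for device_id in range(num_devices):
    half_rank = min(device_id, num_devices - 1 - device_id)
    counts = {
        "1_nF0": (num_half_ranks - half_rank - 1) * 2,
        "2_nF0F1": half_rank + 1,
        "3_nB1W1F1": num_half_ranks - half_rank - 1,
        "4_main": half_num_chunks - num_devices + half_rank + 1,
        "5_nB1F1B0": num_half_ranks - half_rank - 1,
        "6_nB1B0": half_rank + 1,
        "7_nWB0": num_half_ranks - half_rank - 1,
        "8_nW": half_rank + 1,
    }
    print(device_id, counts)
```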
    	
src/visualizer.py CHANGED

@@ -89,11 +89,6 @@ def get_color(op_type: str, stage_id: int, num_devices: int):
 
     # Improved teal/turquoise palette with low saturation and high brightness
     backward_d_colors = [
-        "#ccffff",  # Very light cyan
-        "#b3ffff",  # Pale cyan
-        "#99ffff",  # Light cyan
-        "#80ffff",  # Cyan
-        "#66e6e6",  # Soft teal
         "#4dcccc",  # Light teal
         "#33b3b3",  # Teal
         "#009999",  # Medium teal
@@ -102,12 +97,6 @@ def get_color(op_type: str, stage_id: int, num_devices: int):
 
     # Improved green palette with low saturation and high brightness
     backward_w_colors = [
-        "#ccffe6",  # Very light mint
-        "#b3ffd9",  # Pale mint
-        "#99ffcc",  # Light mint
-        "#80ffbf",  # Mint green
-        "#66e6a6",  # Soft green
-        "#4dcc8c",  # Light green
         "#33b373",  # Medium green
        "#009959",  # Forest green
         "#008040",  # Dark green
@@ -162,7 +151,8 @@ def create_pipeline_figure(
             max_batch = max(max_batch, task["batch"])
 
     # Flag to determine whether to show text labels
-
+    num_operations_per_device = len(schedule_data[0])
+    show_text_labels = num_operations_per_device <= 64
 
     # Create a figure
     fig = go.Figure()