Add support for zero-bubble-1P.
Changed files:
- README.md +5 -0
- main.py +21 -1
- src/execution_model.py +43 -15
- src/strategies.py +69 -15
- src/visualizer.py +73 -18
README.md CHANGED

````diff
@@ -44,6 +44,11 @@ uv run python main.py strategy=interleave num_devices=4 num_stages=8 num_batches
 ```
 
 
+Running for ZB-1P strategy:
+```bash
+uv run python main.py strategy=zb1p num_devices=4 num_stages=8 num_batches=8
+```
+
 ## Configuration
 
 The default configuration is in `conf/config.yaml`. You can override any parameter on the command line or create configuration groups for different scenarios.
````
main.py CHANGED

```diff
@@ -1,5 +1,5 @@
 from src.execution_model import ScheduleConfig
-from src.strategies import generate_1f1b_interleave_schedule, generate_1f1b_schedule
+from src.strategies import generate_1f1b_interleave_schedule, generate_1f1b_schedule, generate_zero_bubble_1p_schedule
 from src.visualizer import visualize_pipeline_parallelism_dash
 import hydra
 from omegaconf import DictConfig, OmegaConf
@@ -14,6 +14,8 @@ def main(cfg: DictConfig) -> None:
         run_1f1b(cfg)
     elif cfg.strategy == "interleave":
         run_interleave(cfg)
+    elif cfg.strategy == "zb1p":
+        run_zero_bubble_1p(cfg)
     else:
         raise ValueError(f"Unknown strategy: {cfg.strategy}")
 
@@ -55,5 +57,23 @@ def run_interleave(cfg: DictConfig) -> None:
     visualize_pipeline_parallelism_dash(schedule, port=cfg.visualization_port)
 
 
+def run_zero_bubble_1p(cfg: DictConfig) -> None:
+    """Run zero bubble 1P pipeline parallelism simulation."""
+    # Convert OmegaConf to dict for op_times if it exists
+    op_times = OmegaConf.to_container(cfg.op_times) if hasattr(cfg, 'op_times') else None
+
+    schedule_config = ScheduleConfig(
+        num_devices=cfg.num_devices,
+        num_stages=cfg.num_stages,
+        num_batches=cfg.num_batches,
+        p2p_latency=cfg.p2p_latency,
+        op_times=op_times,
+        split_backward=True
+    )
+    schedule = generate_zero_bubble_1p_schedule(schedule_config)
+    schedule.execute()
+    visualize_pipeline_parallelism_dash(schedule, port=cfg.visualization_port)
+
+
 if __name__ == "__main__":
     main()
```
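The new `run_zero_bubble_1p` entry point only maps Hydra config fields onto `ScheduleConfig`. For quick experiments the same flow can be driven without Hydra; a minimal sketch with illustrative values (the generator asserts `num_devices == num_stages`, and `op_times` here uses the flat per-op form from the `ScheduleConfig` type hint):

```python
# No-Hydra sketch of the run_zero_bubble_1p flow; all values are illustrative.
from src.execution_model import ScheduleConfig
from src.strategies import generate_zero_bubble_1p_schedule
from src.visualizer import visualize_pipeline_parallelism_dash

config = ScheduleConfig(
    num_devices=4,
    num_stages=4,  # ZB-1P asserts num_devices == num_stages
    num_batches=8,
    p2p_latency=0.0,
    op_times={"forward": 1.0, "backward_D": 1.0, "backward_W": 0.5},
    split_backward=True,  # enables the backward_D / backward_W op types
)
schedule = generate_zero_bubble_1p_schedule(config)
schedule.execute()
visualize_pipeline_parallelism_dash(schedule, port=8050)
```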
src/execution_model.py CHANGED

```diff
@@ -36,6 +36,7 @@ class ScheduleConfig:
         num_batches: int,
         p2p_latency: float = 0.0,
         placement_strategy: str = "standard",
+        split_backward: bool = False,
         op_times: Optional[Dict[str, Union[float, Dict[int, float]]]] = None,
     ):
         self.num_devices = num_devices
@@ -43,12 +44,20 @@ class ScheduleConfig:
         self.num_batches = num_batches
         self.p2p_latency = p2p_latency
         self.placement_strategy = placement_strategy
+        self.split_backward = split_backward
 
         # Initialize default operation times
-        self.op_times = {
-            "forward": 1.0,
-            "backward": 2.0,
-        }
+        if self.split_backward:
+            self.op_times = {
+                "forward": 1.0,
+                "backward_D": 1.0,
+                "backward_W": 1.0,
+            }
+        else:
+            self.op_times = {
+                "forward": 1.0,
+                "backward": 2.0,
+            }
 
         # Update with user-provided operation times
         if op_times:
@@ -119,9 +128,10 @@ class Schedule:
 
         self.init_operations()
 
-    def init_operations(self):
-        op_types = ["forward", "backward"]
-
+    def init_operations(self):
+        op_types = ["forward", "backward"]
+        if self.config.split_backward:
+            op_types = ["forward", "backward_D", "backward_W"]
         for batch_id in range(self.config.num_batches):
             for stage_id in range(self.config.num_stages):
                 for op_type in op_types:
@@ -142,14 +152,32 @@ class Schedule:
                     self.config.p2p_latency,
                 )
             )
-
-        if op.op_type == "backward":
-            if op.stage_id < self.config.num_stages - 1:
-                deps.append(
-                    (
-                        self.get_op(op.batch_id, op.stage_id + 1, "backward"),
-                        self.config.p2p_latency,
-                    )
-                )
+        if self.config.split_backward:
+            if op.op_type == "backward_D":
+                if op.stage_id < self.config.num_stages - 1:
+                    deps.append(
+                        (
+                            self.get_op(op.batch_id, op.stage_id + 1, "backward_D"),
+                            self.config.p2p_latency,
+                        )
+                    )
+            elif op.op_type == "backward_W":
+                if op.stage_id < self.config.num_stages - 1:
+                    deps.append(
+                        (
+                            self.get_op(op.batch_id, op.stage_id, "backward_D"),
+                            self.config.p2p_latency,
+                        )
+                    )
+        else:
+            if op.op_type == "backward":
+                if op.stage_id < self.config.num_stages - 1:
+                    deps.append(
+                        (
+                            self.get_op(op.batch_id, op.stage_id + 1, "backward"),
+                            self.config.p2p_latency,
+                        )
+                    )
 
         device_index = self.dev_queues[op.device_id].ops.index(op)
         if device_index > 0:
@@ -170,7 +198,7 @@ class Schedule:
             print("-" * 80)
 
             for op in self.dev_queues[dev_id].ops:
-                op_type = …
+                op_type = op.op_type
                 start = f"{op.start_time:.2f}" if op.start_time is not None else "N/A"
                 end = f"{op.end_time:.2f}" if op.end_time is not None else "N/A"
```
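The split-backward branch encodes the dependency rule that ZB-1P relies on: `backward_D` (input gradient) at stage `s` waits on `backward_D` at stage `s+1`, while `backward_W` (weight gradient) at stage `s` waits only on that same stage's `backward_D`, so weight-gradient work can be deferred to fill pipeline bubbles. A standalone sketch of just that rule, not repo code (the last stage is left without these deps, mirroring the `stage_id < num_stages - 1` guard above; its ordering comes from the forward/backward chain handled elsewhere):

```python
# Standalone illustration of the cross-stage dependency rule added above.
num_stages = 4  # illustrative

def split_backward_deps(stage_id: int, op_type: str) -> list[tuple[int, str]]:
    if stage_id < num_stages - 1:
        if op_type == "backward_D":
            return [(stage_id + 1, "backward_D")]  # dgrad waits on downstream dgrad
        if op_type == "backward_W":
            return [(stage_id, "backward_D")]      # wgrad waits only on local dgrad
    return []

for s in range(num_stages):
    print(f"stage {s}: backward_D <- {split_backward_deps(s, 'backward_D')}, "
          f"backward_W <- {split_backward_deps(s, 'backward_W')}")
```

For four stages this prints, e.g., `stage 0: backward_D <- [(1, 'backward_D')], backward_W <- [(0, 'backward_D')]`: the weight-gradient op never blocks on downstream stages.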
src/strategies.py CHANGED

```diff
@@ -5,6 +5,8 @@ from src.execution_model import Schedule, ScheduleConfig
 def generate_1f1b_schedule(config: ScheduleConfig):
     schedule = Schedule(config)
 
+    assert config.num_devices == config.num_stages, "num_devices must be equal to num_stages for 1F1B"
+
     for i in range(config.num_devices):
         fwd_batch_id = 0
         bwd_batch_id = 0
@@ -12,30 +14,82 @@ def generate_1f1b_schedule(config: ScheduleConfig):
         steady_batches = config.num_batches - warmup_batches
 
         for _ in range(warmup_batches):
-            schedule.…
-            )
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(fwd_batch_id, i, "forward")
+            )
             fwd_batch_id += 1
 
         for _ in range(steady_batches):
-            schedule.…
-            )
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(fwd_batch_id, i, "forward")
+            )
             fwd_batch_id += 1
-            schedule.…
-            )
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(bwd_batch_id, i, "backward")
+            )
             bwd_batch_id += 1
 
         for _ in range(cooldown_batches):
             schedule.dev_queues[i].add_operation(
-                schedule.get_op(…
+                schedule.get_op(bwd_batch_id, i, "backward")
             )
+            bwd_batch_id += 1
+
+    return schedule
+
+
+def generate_zero_bubble_1p_schedule(config: ScheduleConfig):
+    # Create a new schedule with split_backward=True to support backward_D and backward_W operations
+    schedule = Schedule(config)
+    total_batches = config.num_batches
+    assert config.num_devices == config.num_stages, "num_devices must be equal to num_stages for ZB-1P"
+
+    for i in range(config.num_devices):
+        fwd_batch_id = 0
+        bwd_d_batch_id = 0
+        bwd_w_batch_id = 0
+
+        cooldown_batches = warmup_batches = config.num_devices - i - 1
+        steady_batches = total_batches - warmup_batches
+
+        for _ in range(warmup_batches):
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(fwd_batch_id, i, "forward")
+            )
+            fwd_batch_id += 1
+
+        for _ in range(steady_batches):
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(fwd_batch_id, i, "forward")
+            )
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(bwd_d_batch_id, i, "backward_D")
+            )
+            if fwd_batch_id - bwd_w_batch_id >= config.num_devices - 1:
+                schedule.dev_queues[i].add_operation(
+                    schedule.get_op(bwd_w_batch_id, i, "backward_W")
+                )
+                bwd_w_batch_id += 1
+            bwd_d_batch_id += 1
+            fwd_batch_id += 1
+
+        for _ in range(cooldown_batches):
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(bwd_d_batch_id, i, "backward_D")
+            )
+
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(bwd_w_batch_id, i, "backward_W")
+            )
+
+            bwd_w_batch_id += 1
+            bwd_d_batch_id += 1
+
+        while bwd_w_batch_id < total_batches:
+            schedule.dev_queues[i].add_operation(
+                schedule.get_op(bwd_w_batch_id, i, "backward_W")
+            )
+            bwd_w_batch_id += 1
 
     return schedule
```
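In the steady state each device issues a forward and a `backward_D` per step, but only releases a `backward_W` once `fwd_batch_id - bwd_w_batch_id >= num_devices - 1`, deferring the remaining weight-gradient work into the cooldown phase and the trailing `while` loop, where it fills the bubbles a plain 1F1B schedule would leave idle. A quick way to inspect the per-device order this produces (a sketch; `dev_queues`, `ops`, `op_type`, and `batch_id` are the names used in the diffs above, and the `F0`/`D0`/`W0` token format is just for display):

```python
# Sketch: print each device's operation order as compact tokens like F0, D0, W0.
from src.execution_model import ScheduleConfig
from src.strategies import generate_zero_bubble_1p_schedule

abbrev = {"forward": "F", "backward_D": "D", "backward_W": "W"}
config = ScheduleConfig(num_devices=4, num_stages=4, num_batches=8, split_backward=True)
schedule = generate_zero_bubble_1p_schedule(config)
for dev_id in range(config.num_devices):
    ops = schedule.dev_queues[dev_id].ops
    print(f"device {dev_id}:", " ".join(abbrev[op.op_type] + str(op.batch_id) for op in ops))
```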
src/visualizer.py CHANGED

```diff
@@ -45,10 +45,10 @@ def get_color(op_type: str, stage_id: int, num_devices: int):
     # Color palettes for different virtual stages
     forward_colors = [
         "royalblue",      # Stage 0
-        "…",              # Stage 1
-        "…",              # Stage 2
+        "cornflowerblue", # Stage 1
+        "dodgerblue",     # Stage 2
         "steelblue",      # Stage 3
-        "…",              # Stage 4
+        "lightskyblue",   # Stage 4
         "deepskyblue",    # Stage 5
         "mediumblue",     # Stage 6
         "mediumslateblue",# Stage 7
@@ -56,17 +56,46 @@ def get_color(op_type: str, stage_id: int, num_devices: int):
         "darkslateblue"   # Stage 9
     ]
 
+    # Updated to orange/brown palette for backward operations
     backward_colors = [
-        "…",              # Stage 0
-        "…",              # Stage 1
-        "…",              # Stage 2
-        "…",              # Stage 3
-        "…",              # Stage 4
-        "…",              # Stage 5
-        "…",              # Stage 6
-        "palegreen",      # Stage 7
-        "…",              # Stage 8
-        "…",              # Stage 9
+        "darkorange",     # Stage 0
+        "orange",         # Stage 1
+        "sandybrown",     # Stage 2
+        "peru",           # Stage 3
+        "chocolate",      # Stage 4
+        "sienna",         # Stage 5
+        "saddlebrown",    # Stage 6
+        "brown",          # Stage 7
+        "darkgoldenrod",  # Stage 8
+        "goldenrod"       # Stage 9
     ]
+
+    # Teal/turquoise palette for backward_D operations
+    backward_d_colors = [
+        "mediumaquamarine", # Stage 0
+        "cadetblue",        # Stage 1
+        "lightseagreen",    # Stage 2
+        "cyan",             # Stage 3
+        "teal",             # Stage 4
+        "mediumturquoise",  # Stage 5
+        "turquoise",        # Stage 6
+        "aquamarine",       # Stage 7
+        "darkturquoise",    # Stage 8
+        "paleturquoise"     # Stage 9
+    ]
+
+    # Green palette for backward_W operations
+    backward_w_colors = [
+        "limegreen",         # Stage 0
+        "forestgreen",       # Stage 1
+        "green",             # Stage 2
+        "seagreen",          # Stage 3
+        "mediumseagreen",    # Stage 4
+        "springgreen",       # Stage 5
+        "mediumspringgreen", # Stage 6
+        "palegreen",         # Stage 7
+        "lightgreen",        # Stage 8
+        "darkseagreen"       # Stage 9
+    ]
 
     virtual_stage = stage_id // num_devices
@@ -78,6 +107,10 @@ def get_color(op_type: str, stage_id: int, num_devices: int):
         return forward_colors[color_index]
     elif op_type == "backward":
         return backward_colors[color_index]
+    elif op_type == "backward_D":
+        return backward_d_colors[color_index]
+    elif op_type == "backward_W":
+        return backward_w_colors[color_index]
     else:
         raise ValueError(f"Invalid operation type: {op_type}")
 
@@ -129,7 +162,7 @@ def create_pipeline_figure(schedule_data: Dict[int, List[Dict]], max_time=None,
 
     # Sort tasks by start time to ensure correct rendering
     sorted_tasks = sorted(schedule_data[device], key=lambda t: t["start_time"])
-
+
     for task in sorted_tasks:
         # Determine task color and text color
         if task["type"] == "forward":
@@ -140,6 +173,14 @@ def create_pipeline_figure(schedule_data: Dict[int, List[Dict]], max_time=None,
             color = get_color(task["type"], task["stage"], num_devices)
             text_color = "black"
             name = "Backward"
+        elif task["type"] == "backward_D":
+            color = get_color(task["type"], task["stage"], num_devices)
+            text_color = "black"
+            name = "Backward (Grad)"
+        elif task["type"] == "backward_W":
+            color = get_color(task["type"], task["stage"], num_devices)
+            text_color = "black"
+            name = "Backward (Weight)"
         else:
             color = empty_color
             text_color = "black"
@@ -221,12 +262,24 @@ def create_pipeline_figure(schedule_data: Dict[int, List[Dict]], max_time=None,
                     name=f"Backward (VS {vs})",
                     color=get_color("backward", vs * num_devices, num_devices)
                 ))
+                # Add entries for split backward operations if this is a zb1p schedule
+                if any(task["type"] in ["backward_D", "backward_W"] for device in schedule_data for task in schedule_data[device]):
+                    legend_items.append(dict(
+                        name=f"Backward Grad (VS {vs})",
+                        color=get_color("backward_D", vs * num_devices, num_devices)
+                    ))
+                    legend_items.append(dict(
+                        name=f"Backward Weight (VS {vs})",
+                        color=get_color("backward_W", vs * num_devices, num_devices)
+                    ))
 
     # If no tasks found, add default legend items
     if not legend_items:
         legend_items = [
             dict(name="Forward (VS 0)", color=get_color("forward", 0, num_devices)),
             dict(name="Backward (VS 0)", color=get_color("backward", 0, num_devices)),
+            dict(name="Backward Grad (VS 0)", color=get_color("backward_D", 0, num_devices)),
+            dict(name="Backward Weight (VS 0)", color=get_color("backward_W", 0, num_devices)),
         ]
 
     for i, item in enumerate(legend_items):
@@ -277,12 +330,12 @@ def create_pipeline_figure(schedule_data: Dict[int, List[Dict]], max_time=None,
             yanchor="top",
             y=1.02, # Position at the top
             xanchor="right",
-            x=1.…
+            x=1.20, # Position further to the right to accommodate more items
             title=dict(text="<b>Operation Types:</b>"),
             itemsizing="constant",
             tracegroupgap=0
         ),
-        width=…
+        width=2000, # Increase width to accommodate the expanded legend
         height=400, # Maintain current height
         bargap=0,
         bargroupgap=0,
@@ -304,7 +357,7 @@ def create_dash_app(schedule: Schedule, schedule_type="1f1b", enable_caching: bool
 
     Args:
         schedule: Schedule object to visualize
-        schedule_type: Type of schedule ("1f1b" or custom description)
+        schedule_type: Type of schedule ("1f1b", "zb1p", or custom description)
         enable_caching: Whether to cache the schedule data and figure
     """
     # Process schedule data only once and cache it
@@ -381,7 +434,8 @@ def visualize_pipeline_parallelism_dash(
     schedule: Schedule,
     port: int = 8050,
     debug: bool = False,
-    enable_caching: bool = True
+    enable_caching: bool = True,
+    schedule_type="1f1b"
 ):
     """
     Launch a Dash app to visualize the pipeline schedule interactively.
@@ -391,7 +445,8 @@ def visualize_pipeline_parallelism_dash(
         port: Port to run the Dash app on
         debug: Whether to run the Dash app in debug mode
         enable_caching: Whether to cache schedule data and figures
+        schedule_type: Type of schedule ("1f1b", "zb1p", or custom description)
     """
-    app = create_dash_app(schedule, enable_caching=enable_caching)
+    app = create_dash_app(schedule, schedule_type=schedule_type, enable_caching=enable_caching)
     print(f"Starting Dash app on http://localhost:{port}/")
     app.run_server(debug=debug, port=port)
```
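With `schedule_type` now threaded through `visualize_pipeline_parallelism_dash` into `create_dash_app`, callers can label the dashboard for the new schedule. A self-contained usage sketch under the same assumptions as the earlier examples (illustrative port, `num_devices == num_stages`):

```python
# Sketch: launch the Dash visualization for a ZB-1P schedule, labeled as such.
from src.execution_model import ScheduleConfig
from src.strategies import generate_zero_bubble_1p_schedule
from src.visualizer import visualize_pipeline_parallelism_dash

config = ScheduleConfig(num_devices=4, num_stages=4, num_batches=8, split_backward=True)
schedule = generate_zero_bubble_1p_schedule(config)
schedule.execute()
visualize_pipeline_parallelism_dash(schedule, port=8051, schedule_type="zb1p")
```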