FireFlow

Running on Zero

App Files Files Community

shuanholmes committed on Dec 13, 2024

Commit

bf00c4c

1 Parent(s): d429710

[FireFlow] Init Commit

Browse files

Files changed (3) hide show

app.py +84 -74
flux/modules/layers.py +38 -12
flux/sampling.py +19 -15

app.py CHANGED Viewed

@@ -45,24 +45,26 @@ def encode(init_image, torch_device):
         init_image = ae.encode(init_image.to()).to(torch.bfloat16)
     return init_image
 device = "cuda" if torch.cuda.is_available() else "cpu"
 name = 'flux-dev'
-ae = load_ae(name, device)
 t5 = load_t5(device, max_length=256 if name == "flux-schnell" else 512)
 clip = load_clip(device)
-model = load_flow_model(name, device=device)
-offload = False
-name = "flux-dev"
 is_schnell = False
-feature_path = 'feature'
 output_dir = 'result'
 add_sampling_metadata = True
 @spaces.GPU(duration=120)
 @torch.inference_mode()
-def edit(init_image, source_prompt, target_prompt, num_steps, inject_step, guidance, seed):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     torch.cuda.empty_cache()
     seed = None
@@ -76,15 +78,12 @@ def edit(init_image, source_prompt, target_prompt, num_steps, inject_step, guida
     width, height = init_image.shape[0], init_image.shape[1]
     init_image = torch.from_numpy(init_image).permute(2, 0, 1).float() / 127.5 - 1
     init_image = init_image.unsqueeze(0)
     init_image = init_image.to(device)
     with torch.no_grad():
         init_image = ae.encode(init_image.to()).to(torch.bfloat16)
-    print(init_image.shape)
     rng = torch.Generator(device="cpu")
     opts = SamplingOptions(
             source_prompt=source_prompt,
@@ -97,6 +96,11 @@ def edit(init_image, source_prompt, target_prompt, num_steps, inject_step, guida
         )
     if opts.seed is None:
         opts.seed = torch.Generator(device="cpu").seed()
     print(f"Generating with seed {opts.seed}:\n{opts.source_prompt}")
     t0 = time.perf_counter()
@@ -106,12 +110,23 @@ def edit(init_image, source_prompt, target_prompt, num_steps, inject_step, guida
     #############inverse#######################
     info = {}
     info['feature'] = {}
-    info['inject_step'] = inject_step
     with torch.no_grad():
         inp = prepare(t5, clip, init_image, prompt=opts.source_prompt)
         inp_target = prepare(t5, clip, init_image, prompt=opts.target_prompt)
     timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
     # inversion initial noise
     with torch.no_grad():
@@ -137,6 +152,11 @@ def edit(init_image, source_prompt, target_prompt, num_steps, inject_step, guida
             idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
         else:
             idx = 0
     device = torch.device("cuda")
     with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
@@ -166,97 +186,87 @@ def edit(init_image, source_prompt, target_prompt, num_steps, inject_step, guida
     return img
-def create_demo(model_name: str, device: str = "cuda:0" if torch.cuda.is_available() else "cpu", offload: bool = False):
     is_schnell = model_name == "flux-schnell"
     title = r"""
-        <h1 align="center">🪄 Taming Rectified Flow for Inversion and Editing</h1>
         """
     description = r"""
-        <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/wangjiangshan0725/RF-Solver-Edit' target='_blank'><b>Taming Rectified Flow for Inversion and Editing</b></a>.<br>
-        ❗️❗️❗️[<b>Important</b>] Editing steps:<br>
-        1️⃣ Upload images you want to edit (The resolution is expected be less than 1360*768, or the memory of GPU may be not enough.) <br>
-        2️⃣ Enter the source prompt, which describes the content of the image you unload. The source prompt is not mandatory; you can also leave it to null. <br>
-        3️⃣ Enter the target prompt which describes the expected content of the edited image. <br>
-        4️⃣ Click the <b>Generate</b> button to start editing. <br>
-        5️⃣ We suggest to adjust the value of **feature sharing steps** for better results.<br>
-        """
-    article = r"""
-    If our work is helpful, please help to ⭐ the <a href='https://github.com/wangjiangshan0725/RF-Solver-Edit' target='_blank'>Github Repo</a>. Thanks!
     """
-    badge = r"""
-    [![GitHub Stars](https://img.shields.io/github/stars/wangjiangshan0725/RF-Solver-Edit?style=social)](https://github.com/wangjiangshan0725/RF-Solver-Edit)
     """
     css = '''
     .gradio-container {width: 85% !important}
     '''
     with gr.Blocks(css=css) as demo:
-        # gr.Markdown(f"# Official Demo for Taming Rectified Flow for Inversion and Editing")
         gr.HTML(title)
         gr.Markdown(description)
         gr.Markdown(article)
-        gr.Markdown(badge)
         with gr.Row():
             with gr.Column():
-                source_prompt = gr.Textbox(label="Source Prompt", value="")
-                target_prompt = gr.Textbox(label="Target Prompt", value="")
-                # source_prompt = gr.Text(
-                #     label="Source Prompt",
-                #     show_label=False,
-                #     max_lines=1,
-                #     placeholder="Enter your source prompt",
-                #     container=False,
-                #     value=""
-                # )
-                # target_prompt = gr.Text(
-                #     label="Target Prompt",
-                #     show_label=False,
-                #     max_lines=1,
-                #     placeholder="Enter your target prompt",
-                #     container=False,
-                #     value=""
-                # )
                 init_image = gr.Image(label="Input Image", visible=True)
                 generate_btn = gr.Button("Generate")
             with gr.Column():
                 with gr.Accordion("Advanced Options", open=True):
-                    num_steps = gr.Slider(1, 30, 25, step=1, label="Total timesteps")
-                    inject_step = gr.Slider(1, 15, 3, step=1, label="Feature sharing steps")
-                    guidance = gr.Slider(1.0, 10.0, 2, step=0.1, label="Guidance", interactive=not is_schnell)
-                    # seed = gr.Textbox(0, label="Seed (-1 for random)", visible=False)
-                    # add_sampling_metadata = gr.Checkbox(label="Add sampling parameters to metadata?", value=False)
                 output_image = gr.Image(label="Generated Image")
         generate_btn.click(
             fn=edit,
-            inputs=[init_image, source_prompt, target_prompt, num_steps, inject_step, guidance],
             outputs=[output_image]
         )
     return demo
-# if __name__ == "__main__":
-#     import argparse
-#     parser = argparse.ArgumentParser(description="Flux")
-#     parser.add_argument("--name", type=str, default="flux-dev", choices=list(configs.keys()), help="Model name")
-#     parser.add_argument("--device", type=str, default="cuda:0" if torch.cuda.is_available() else "cpu", help="Device to use")
-#     parser.add_argument("--offload", action="store_true", help="Offload model to CPU when not in use")
-#     parser.add_argument("--share", action="store_true", help="Create a public link to your demo")
-#     parser.add_argument("--port", type=int, default=41035)
-#     args = parser.parse_args()
 demo = create_demo("flux-dev", "cuda")
 demo.launch()

         init_image = ae.encode(init_image.to()).to(torch.bfloat16)
     return init_image
+torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+offload = True
 device = "cuda" if torch.cuda.is_available() else "cpu"
 name = 'flux-dev'
+ae = load_ae(name, device="cpu" if offload else torch_device)
 t5 = load_t5(device, max_length=256 if name == "flux-schnell" else 512)
 clip = load_clip(device)
+model = load_flow_model(name, device="cpu" if offload else torch_device)
+if offload:
+    model.cpu()
+    torch.cuda.empty_cache()
+    ae.encoder.to(torch_device)
 is_schnell = False
 output_dir = 'result'
 add_sampling_metadata = True
 @spaces.GPU(duration=120)
 @torch.inference_mode()
+def edit(init_image, source_prompt, target_prompt, editing_strategy, num_steps, inject_step, guidance, seed):
+    global ae, t5, clip, model, name, is_schnell, output_dir, add_sampling_metadata
     device = "cuda" if torch.cuda.is_available() else "cpu"
     torch.cuda.empty_cache()
     seed = None
     width, height = init_image.shape[0], init_image.shape[1]
     init_image = torch.from_numpy(init_image).permute(2, 0, 1).float() / 127.5 - 1
     init_image = init_image.unsqueeze(0)
     init_image = init_image.to(device)
     with torch.no_grad():
         init_image = ae.encode(init_image.to()).to(torch.bfloat16)
     rng = torch.Generator(device="cpu")
     opts = SamplingOptions(
             source_prompt=source_prompt,
         )
     if opts.seed is None:
         opts.seed = torch.Generator(device="cpu").seed()
+    if offload:
+        ae = ae.cpu()
+        torch.cuda.empty_cache()
+        t5, clip = t5.to(torch_device), clip.to(torch_device)
     print(f"Generating with seed {opts.seed}:\n{opts.source_prompt}")
     t0 = time.perf_counter()
     #############inverse#######################
     info = {}
     info['feature'] = {}
+    info['inject_step'] = min(inject_step, num_steps)
+    info['reuse_v']= False
+    info['editing_strategy']= " ".join(editing_strategy)
+    info['start_layer_index'] = 20
+    info['end_layer_index'] = 37
+    qkv_ratio = '1.0,1.0,1.0'
+    info['qkv_ratio'] = list(map(float, qkv_ratio.split(',')))
     with torch.no_grad():
         inp = prepare(t5, clip, init_image, prompt=opts.source_prompt)
         inp_target = prepare(t5, clip, init_image, prompt=opts.target_prompt)
     timesteps = get_schedule(opts.num_steps, inp["img"].shape[1], shift=(name != "flux-schnell"))
+    if offload:
+        t5, clip = t5.cpu(), clip.cpu()
+        torch.cuda.empty_cache()
+        model = model.to(torch_device)
     # inversion initial noise
     with torch.no_grad():
             idx = max(int(fn.split("_")[-1].split(".")[0]) for fn in fns) + 1
         else:
             idx = 0
+    if offload:
+        model.cpu()
+        torch.cuda.empty_cache()
+        ae.decoder.to(x.device)
     device = torch.device("cuda")
     with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
     return img
+def create_demo(model_name: str, device: str = "cuda:0" if torch.cuda.is_available() else "cpu"):
     is_schnell = model_name == "flux-schnell"
     title = r"""
+        <h1 align="center">🔥FireFlow: Fast Inversion of Rectified Flow for Image Semantic Editing</h1>
         """
     description = r"""
+        <b>Official 🤗 Gradio Demo</b> for <a href='https://github.com/HolmesShuan/FireFlow-Fast-Inversion-of-Rectified-Flow-for-Image-Semantic-Editing' target='_blank'><b>🔥FireFlow: Fast Inversion of Rectified Flow for Image Semantic Editing</b></a>.<br>
     """
+    article = r"""
+    If you find our work helpful, we would greatly appreciate it if you could ⭐ our <a href='https://github.com/HolmesShuan/FireFlow-Fast-Inversion-of-Rectified-Flow-for-Image-Semantic-Editing' target='_blank'>GitHub repository</a>. Thank you for your support!
     """
     css = '''
     .gradio-container {width: 85% !important}
     '''
     with gr.Blocks(css=css) as demo:
+        # Add a title, description, and additional information
         gr.HTML(title)
         gr.Markdown(description)
         gr.Markdown(article)
+        # Layout: Two columns
         with gr.Row():
+            # Left Column: Inputs
             with gr.Column():
                 init_image = gr.Image(label="Input Image", visible=True)
+                source_prompt = gr.Textbox(label="Source Prompt", value="", placeholder="(Optional) Describe the content of the uploaded image.")
+                target_prompt = gr.Textbox(label="Target Prompt", value="", placeholder="(Required) Describe the desired content of the edited image.")
+                # CheckboxGroup for editing strategies
+                editing_strategy = gr.CheckboxGroup(
+                    label="Editing Technique",
+                    choices=['replace_v', 'add_q', 'add_k'],
+                    value=['replace_v'],  # Default: only 'replace_v' selected
+                    interactive=True
+                )
                 generate_btn = gr.Button("Generate")
+            # Right Column: Advanced options and output
             with gr.Column():
                 with gr.Accordion("Advanced Options", open=True):
+                    num_steps = gr.Slider(
+                        minimum=1,
+                        maximum=30,
+                        value=8,
+                        step=1,
+                        label="Total timesteps"
+                    )
+                    inject_step = gr.Slider(
+                        minimum=1,
+                        maximum=15,
+                        value=1,
+                        step=1,
+                        label="Feature sharing steps"
+                    )
+                    guidance = gr.Slider(
+                        minimum=1.0,
+                        maximum=8.0,
+                        value=2.0,
+                        step=0.1,
+                        label="Guidance",
+                        interactive=not is_schnell
+                    )
+                # Output display
                 output_image = gr.Image(label="Generated Image")
+        # Button click event to trigger the edit function
         generate_btn.click(
             fn=edit,
+            inputs=[
+                init_image,
+                source_prompt,
+                target_prompt,
+                editing_strategy,  # Include the selected editing strategies
+                num_steps,
+                inject_step,
+                guidance
+            ],
             outputs=[output_image]
         )
     return demo
 demo = create_demo("flux-dev", "cuda")
 demo.launch()

flux/modules/layers.py CHANGED Viewed

@@ -243,21 +243,47 @@ class SingleStreamBlock(nn.Module):
         q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
         q, k = self.norm(q, k, v)
-        # Note: If the memory of your device is not enough, you may consider uncomment the following code.
-        # if info['inject'] and info['id'] > 19:
-        #     store_path = os.path.join(info['feature_path'], str(info['t']) + '_' + str(info['second_order']) + '_' + str(info['id']) + '_' + info['type'] + '_' + 'V' + '.pth')
-        #     if info['inverse']:
-        #         torch.save(v, store_path)
-        #     if not info['inverse']:
-        #         v = torch.load(store_path, weights_only=True)
         # Save the features in the memory
-        if info['inject'] and info['id'] > 19:
-            feature_name = str(info['t']) + '_' + str(info['second_order']) + '_' + str(info['id']) + '_' + info['type'] + '_' + 'V'
             if info['inverse']:
-                info['feature'][feature_name] = v.cpu()
             else:
-                v = info['feature'][feature_name].cuda()
         # compute attention
         attn = attention(q, k, v, pe=pe)

         q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
         q, k = self.norm(q, k, v)
         # Save the features in the memory
+        if info['inject'] and info['id'] <= info['end_layer_index'] and info['id'] >= info['start_layer_index']:
+            v_feature_name = str(info['t']) + '_' + str(info['second_order']) + '_' + str(info['id']) + '_' + info['type'] + '_' + 'V'
+            k_feature_name = str(info['t']) + '_' + str(info['second_order']) + '_' + str(info['id']) + '_' + info['type'] + '_' + 'K'
+            q_feature_name = str(info['t']) + '_' + str(info['second_order']) + '_' + str(info['id']) + '_' + info['type'] + '_' + 'Q'
             if info['inverse']:
+                if info['reuse_v']:
+                    info['feature'][v_feature_name] = v.cpu()
+                else:
+                    editing_strategy = info['editing_strategy']
+                    qkv_ratio = info['qkv_ratio']
+                    if 'q' in editing_strategy:
+                        info['feature'][q_feature_name] = (q * qkv_ratio[0]).cpu()
+                    if 'k' in editing_strategy:
+                        info['feature'][k_feature_name] = (k * qkv_ratio[1]).cpu()
+                    if 'v' in editing_strategy:
+                        info['feature'][v_feature_name] = (v * qkv_ratio[2]).cpu()
             else:
+                if info['reuse_v']:
+                    if v_feature_name in info['feature']:
+                        v = info['feature'][v_feature_name].cuda()
+                else:
+                    editing_strategy = info['editing_strategy']
+                    if 'replace_v' in editing_strategy:
+                        if v_feature_name in info['feature']:
+                            v = info['feature'][v_feature_name].cuda()
+                    if 'add_v' in editing_strategy:
+                        if v_feature_name in info['feature']:
+                            v += info['feature'][v_feature_name].cuda()
+                    if 'replace_k' in editing_strategy:
+                        if k_feature_name in info['feature']:
+                            k = info['feature'][k_feature_name].cuda()
+                    if 'add_k' in editing_strategy:
+                        if k_feature_name in info['feature']:
+                            k += info['feature'][k_feature_name].cuda()
+                    if 'replace_q' in editing_strategy:
+                        if q_feature_name in info['feature']:
+                            q = info['feature'][q_feature_name].cuda()
+                    if 'add_q' in editing_strategy:
+                        if q_feature_name in info['feature']:
+                            q += info['feature'][q_feature_name].cuda()
         # compute attention
         attn = attention(q, k, v, pe=pe)

flux/sampling.py CHANGED Viewed

@@ -97,6 +97,7 @@ def denoise(
     guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
     step_list = []
     for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])):
         t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
         info['t'] = t_prev if inverse else t_curr
@@ -104,20 +105,23 @@ def denoise(
         info['second_order'] = False
         info['inject'] = inject_list[i]
-        pred, info = model(
-            img=img,
-            img_ids=img_ids,
-            txt=txt,
-            txt_ids=txt_ids,
-            y=vec,
-            timesteps=t_vec,
-            guidance=guidance_vec,
-            info=info
-        )
         img_mid = img + (t_prev - t_curr) / 2 * pred
-        t_vec_mid = torch.full((img.shape[0],), (t_curr + (t_prev - t_curr) / 2), dtype=img.dtype, device=img.device)
         info['second_order'] = True
         pred_mid, info = model(
             img=img_mid,
@@ -129,9 +133,9 @@ def denoise(
             guidance=guidance_vec,
             info=info
         )
-        first_order = (pred_mid - pred) / ((t_prev - t_curr) / 2)
-        img = img + (t_prev - t_curr) * pred + 0.5 * (t_prev - t_curr) ** 2 * first_order
     return img, info

     guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
     step_list = []
+    next_step_velocity = None
     for i, (t_curr, t_prev) in enumerate(zip(timesteps[:-1], timesteps[1:])):
         t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
         info['t'] = t_prev if inverse else t_curr
         info['second_order'] = False
         info['inject'] = inject_list[i]
+        if next_step_velocity is None:
+            pred, info = model(
+                img=img,
+                img_ids=img_ids,
+                txt=txt,
+                txt_ids=txt_ids,
+                y=vec,
+                timesteps=t_vec,
+                guidance=guidance_vec,
+                info=info
+            )
+        else:
+            pred = next_step_velocity
         img_mid = img + (t_prev - t_curr) / 2 * pred
+        t_vec_mid = torch.full((img.shape[0],), t_curr + (t_prev - t_curr) / 2, dtype=img.dtype, device=img.device)
         info['second_order'] = True
         pred_mid, info = model(
             img=img_mid,
             guidance=guidance_vec,
             info=info
         )
+        next_step_velocity = pred_mid
+        img = img + (t_prev - t_curr) * pred_mid
     return img, info