Spaces: Running on Zero

Commit: add image prompt option
app.py CHANGED
@@ -40,6 +40,9 @@ def main():
                     label='Only First Stage',
                     value=only_first_stage,
                     visible=not only_first_stage)
+                image_prompt = gr.Image(type="filepath",
+                                        label="Image Prompt",
+                                        value=None)
                 run_button = gr.Button('Run')

         with gr.Column():
@@ -50,10 +53,10 @@ def main():
             result_video = gr.Video(show_label=False)

         examples = gr.Examples(
-            examples=[['骑滑板的皮卡丘', False, 1234, True],
-                      ['a cat playing chess', True, 1253, True]],
+            examples=[['骑滑板的皮卡丘', False, 1234, True, None],
+                      ['a cat playing chess', True, 1253, True, None]],
             fn=model.run_with_translation,
-            inputs=[text, translate, seed, only_first_stage],
+            inputs=[text, translate, seed, only_first_stage, image_prompt],
             outputs=[translated_text, result_video],
             cache_examples=True)

@@ -66,6 +69,7 @@ def main():
                 translate,
                 seed,
                 only_first_stage,
+                image_prompt,
             ],
             outputs=[translated_text, result_video])
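For reference, the pattern this commit applies in app.py (an optional image input threaded through gr.Examples and the click handler) looks roughly like the self-contained sketch below. fake_run is a placeholder for the Space's actual model call and is not part of the commit:

import gradio as gr

def fake_run(text: str, seed: int, image_prompt: str | None) -> str:
    # With type="filepath", Gradio passes the upload as a path string,
    # or None when the field is left empty.
    source = image_prompt if image_prompt is not None else 'text only'
    return f'prompt={text!r}, seed={seed}, first frame from: {source}'

with gr.Blocks() as demo:
    text = gr.Textbox(label='Input Text')
    seed = gr.Slider(0, 100000, step=1, value=1234, label='Seed')
    # The new optional input; value=None lets the existing text-only
    # examples keep working by appending None to each example row.
    image_prompt = gr.Image(type='filepath', label='Image Prompt', value=None)
    run_button = gr.Button('Run')
    result = gr.Textbox(label='Result')

    gr.Examples(examples=[['a cat playing chess', 1253, None]],
                inputs=[text, seed, image_prompt])
    run_button.click(fn=fake_run,
                     inputs=[text, seed, image_prompt],
                     outputs=result)

demo.launch()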
model.py CHANGED
@@ -796,7 +796,8 @@ class Model:
                 video_raw_text=None,
                 video_guidance_text='视频',
                 image_text_suffix='',
-                batch_size=1):
+                batch_size=1,
+                image_prompt=None):
         process_start_time = time.perf_counter()

         generate_frame_num = self.args.generate_frame_num
@@ -828,33 +829,36 @@ class Model:

         seq_1st = torch.tensor(seq_1st, dtype=torch.long,
                                device=self.device).unsqueeze(0)
-        output_list_1st = []
-        for tim in range(max(batch_size // mbz, 1)):
-            start_time = time.perf_counter()
-            output_list_1st.append(
-                my_filling_sequence(
-                    model,
-                    tokenizer,
-                    self.args,
-                    seq_1st.clone(),
-                    batch_size=min(batch_size, mbz),
-                    get_masks_and_position_ids=
-                    get_masks_and_position_ids_stage1,
-                    text_len=text_len_1st,
-                    frame_len=frame_len,
-                    strategy=self.strategy_cogview2,
-                    strategy2=self.strategy_cogvideo,
-                    log_text_attention_weights=1.4,
-                    enforce_no_swin=True,
-                    mode_stage1=True,
-                )[0])
-            elapsed = time.perf_counter() - start_time
-            logger.info(f'[First Frame] Elapsed: {elapsed:.2f}')
-        output_tokens_1st = torch.cat(output_list_1st, dim=0)
-        given_tokens = output_tokens_1st[:, text_len_1st + 1:text_len_1st +
-                                         401].unsqueeze(
-                                             1
-                                         )  # given_tokens.shape: [bs, frame_num, 400]
+        if image_prompt is None:
+            output_list_1st = []
+            for tim in range(max(batch_size // mbz, 1)):
+                start_time = time.perf_counter()
+                output_list_1st.append(
+                    my_filling_sequence(
+                        model,
+                        tokenizer,
+                        self.args,
+                        seq_1st.clone(),
+                        batch_size=min(batch_size, mbz),
+                        get_masks_and_position_ids=
+                        get_masks_and_position_ids_stage1,
+                        text_len=text_len_1st,
+                        frame_len=frame_len,
+                        strategy=self.strategy_cogview2,
+                        strategy2=self.strategy_cogvideo,
+                        log_text_attention_weights=1.4,
+                        enforce_no_swin=True,
+                        mode_stage1=True,
+                    )[0])
+                elapsed = time.perf_counter() - start_time
+                logger.info(f'[First Frame] Elapsed: {elapsed:.2f}')
+            output_tokens_1st = torch.cat(output_list_1st, dim=0)
+            given_tokens = output_tokens_1st[:, text_len_1st + 1:text_len_1st +
+                                             401].unsqueeze(
+                                                 1
+                                             )  # given_tokens.shape: [bs, frame_num, 400]
+        else:
+            given_tokens = tokenizer.encode(image_path=image_prompt, image_size=160).repeat(batch_size, 1).unsqueeze(1)

         # generate subsequent frames:
         total_frames = generate_frame_num
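Both branches of the hunk above must produce given_tokens with the same shape, [batch_size, 1, 400]: 400 token ids per sample for the single given first frame, which for image_size=160 presumably corresponds to a 20x20 grid under the tokenizer's 8x spatial compression. A shape-only sketch with dummy tensors (not the real model or tokenizer):

import torch

batch_size, text_len_1st = 2, 30

# Branch 1 (no image prompt): the first frame's tokens are sampled by the
# stage-1 model; dummy stand-in for its output, [bs, text_len_1st + 1 + 400].
output_tokens_1st = torch.randint(0, 20000, (batch_size, text_len_1st + 1 + 400))
given_tokens = output_tokens_1st[:, text_len_1st + 1:text_len_1st + 401].unsqueeze(1)
print(given_tokens.shape)  # torch.Size([2, 1, 400])

# Branch 2 (image prompt): tokenizer.encode(image_path=..., image_size=160)
# yields one row of 400 ids; repeat over the batch and add the frame axis.
encoded = torch.randint(0, 20000, (1, 400))  # dummy stand-in for the encoder
given_tokens = encoded.repeat(batch_size, 1).unsqueeze(1)
print(given_tokens.shape)  # torch.Size([2, 1, 400])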
@@ -1167,7 +1171,7 @@ class Model:
                 1, 2, 0).to(torch.uint8).numpy()

     def run(self, text: str, seed: int,
-            only_first_stage: bool) -> list[np.ndarray]:
+            only_first_stage: bool, image_prompt: None) -> list[np.ndarray]:
         logger.info('==================== run ====================')
         start = time.perf_counter()

@@ -1188,7 +1192,8 @@ class Model:
             video_raw_text=text,
             video_guidance_text='视频',
             image_text_suffix=' 高清摄影',
-            batch_size=self.args.batch_size)
+            batch_size=self.args.batch_size,
+            image_prompt=image_prompt)
         if not only_first_stage:
             _, res = self.process_stage2(
                 self.model_stage2,
@@ -1226,12 +1231,13 @@ class AppModel(Model):

     def run_with_translation(
             self, text: str, translate: bool, seed: int,
-            only_first_stage: bool) -> tuple[str | None, str | None]:
-        logger.info(f'{text=}, {translate=}, {seed=}, {only_first_stage=}')
+            only_first_stage: bool,
+            image_prompt: None) -> tuple[str | None, str | None]:
+        logger.info(f'{text=}, {translate=}, {seed=}, {only_first_stage=}, {image_prompt=}')
         if translate:
             text = translated_text = self.translator(text)
         else:
             translated_text = None
-        frames = self.run(text, seed, only_first_stage)
+        frames = self.run(text, seed, only_first_stage, image_prompt)
         video_path = self.to_video(frames)
         return translated_text, video_path
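Taken together, the model.py changes thread the new argument down the call chain (run_with_translation -> run -> process), where a non-None image_prompt replaces first-stage sampling with direct tokenization of the supplied file. A hypothetical driver follows; AppModel construction is elided and 'cat.png' is a placeholder path, neither being part of the commit:

model = AppModel(args)  # assumes the Space's existing setup; real signature may differ

# Text-only: image_prompt=None, so the first frame is sampled as before.
_, video_path = model.run_with_translation(
    'a cat playing chess', translate=False, seed=1253,
    only_first_stage=True, image_prompt=None)

# Image-prompted: the file is encoded into the 400 first-frame tokens and
# the model generates the remaining frames from it.
_, video_path = model.run_with_translation(
    'a cat playing chess', translate=False, seed=1253,
    only_first_stage=True, image_prompt='cat.png')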