Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Files Community

prithivMLmods committed on May 29

Commit

ec8d7fa

verified ·

1 Parent(s): 40afddd

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -170

app.py CHANGED Viewed

@@ -1,181 +1,139 @@
 import os
-import random
 import uuid
 import gradio as gr
 import numpy as np
 from PIL import Image
-import torch
-from diffusers import DiffusionPipeline
-import spaces
-# Setup
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_repo_id = "stabilityai/stable-diffusion-3.5-large-turbo"
-torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
-pipe = pipe.to(device)
-pipe.load_lora_weights("strangerzonehf/SD3.5-Turbo-Portrait-LoRA", weight_name="SD3.5-Turbo-Portrait.safetensors")
-pipe.fuse_lora(lora_scale=1.0)
-MAX_SEED = np.iinfo(np.int32).max
-MAX_IMAGE_SIZE = 1024
-# Style presets
-style_list = [
-    {
-        "name": "3840 x 2160",
-        "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
-    },
-    {
-        "name": "2560 x 1440",
-        "prompt": "hyper-realistic 4K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
-    },
-    {
-        "name": "HD+",
-        "prompt": "hyper-realistic 2K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-        "negative_prompt": "cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly",
-    },
-    {
-        "name": "Style Zero",
-        "prompt": "{prompt}",
-        "negative_prompt": "",
-    },
-]
-STYLE_NAMES = [s["name"] for s in style_list]
-def randomize_seed_fn(seed, randomize):
-    return random.randint(0, MAX_SEED) if randomize else seed
-def save_image(img):
-    filename = str(uuid.uuid4()) + ".png"
-    img.save(filename)
-    return filename
 @spaces.GPU
-def generate_images(
-    prompt,
-    style,
-    negative_prompt,
-    seed,
-    randomize_seed,
-    width,
-    height,
-    guidance_scale,
-    num_inference_steps,
-    num_images,
-    progress=gr.Progress(track_tqdm=True)
-):
-    seed = randomize_seed_fn(seed, randomize_seed)
-    generator = torch.Generator(device=device).manual_seed(seed)
-    selected_style = next(s for s in style_list if s["name"] == style)
-    styled_prompt = selected_style["prompt"].format(prompt=prompt)
-    styled_negative_prompt = selected_style["negative_prompt"] if not negative_prompt else negative_prompt
-    images = []
-    for _ in range(num_images):
-        image = pipe(
-            prompt=styled_prompt,
-            negative_prompt=styled_negative_prompt,
-            width=width,
-            height=height,
-            guidance_scale=guidance_scale,
-            num_inference_steps=num_inference_steps,
-            generator=generator
-        ).images[0]
-        images.append(image)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
-# CSS & Interface
-css = '''
-.gradio-container {
-    max-width: 150%;
-    margin: 0 auto;
-}
-h1 { text-align: center; }
-footer { visibility: hidden; }
-'''
-examples = [
-    "portrait photo of a futuristic astronaut",
-    "macro shot of a water droplet on a leaf",
-    "hyper-realistic food photography of a burger",
-    "cyberpunk city at night, rain, neon lights",
-    "ultra detailed fantasy landscape with dragons",
-]
-with gr.Blocks(css=css, theme="YTheme/GMaterial") as demo:
-    gr.Markdown("## SD3.5 Turbo Portrait")
-    with gr.Row():
-        with gr.Column(scale=1):
-            with gr.Row():
-                prompt = gr.Text(
-                    show_label=False,
-                    max_lines=1,
-                    placeholder="Enter your prompt",
-                    container=False,
-                )
-                run_button = gr.Button("Run", scale=0, variant="primary")
-            result_gallery = gr.Gallery(show_label=False, format="png", columns=2, object_fit="contain")
-            with gr.Accordion("Advanced Settings", open=False):
-                num_images = gr.Slider(
-                    label="Number of Images",
-                    minimum=1,
-                    maximum=10,
-                    value=5,
-                    step=1,
-                )
-                style = gr.Dropdown(label="Select Style", choices=STYLE_NAMES, value=STYLE_NAMES[0])
-                negative_prompt = gr.Text(
-                    label="Negative Prompt",
-                    max_lines=4,
-                    lines=3,
-                    value="cartoonish, low resolution, blurry, simplistic, abstract, deformed, ugly"
-                )
-                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
-                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-                with gr.Row():
-                    width = gr.Slider(label="Width", minimum=512, maximum=MAX_IMAGE_SIZE, step=64, value=1024)
-                    height = gr.Slider(label="Height", minimum=512, maximum=MAX_IMAGE_SIZE, step=64, value=1024)
-                with gr.Row():
-                    guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=15, step=0.5, value=0.0)
-                    num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=30, step=1, value=4)
-        with gr.Column(scale=1):
-            gr.Examples(
-                examples=examples,
-                inputs=prompt,
-                cache_examples=False,
-            )
-    gr.on(
-        triggers=[prompt.submit, run_button.click],
-        fn=generate_images,
-        inputs=[
-            prompt,
-            style,
-            negative_prompt,
-            seed,
-            randomize_seed,
-            width,
-            height,
-            guidance_scale,
-            num_inference_steps,
-            num_images
         ],
-        outputs=[result_gallery, seed],
-        api_name="generate"
     )
 if __name__ == "__main__":
-    demo.queue(max_size=40).launch(ssr_mode=False)

 import os
 import uuid
+import time
+import asyncio
+from threading import Thread
 import gradio as gr
+import spaces
+import torch
 import numpy as np
 from PIL import Image
+import cv2
+import edge_tts
+from transformers import (
+    Qwen2_5_VLForConditionalGeneration,
+    AutoProcessor,
+    TextIteratorStreamer
+)
+# Constants
+# Upper bound on the tokenized input length; read from the
+# MAX_INPUT_TOKEN_LENGTH environment variable, defaulting to 4096.
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# Use the first CUDA device when torch reports one, otherwise the CPU.
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Load the multimodal processor and model identified by MODEL_ID
+# (nvidia/Cosmos-Reason1-7B), using the Qwen2.5-VL model class.
+MODEL_ID = "nvidia/Cosmos-Reason1-7B"
+processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+# Weights are loaded as float16, moved to `device`, and switched to eval mode.
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID,
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+).to(device).eval()
+def downsample_video(video_path: str, num_frames: int = 10):
+    vidcap = cv2.VideoCapture(video_path)
+    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+    idxs = np.linspace(0, total - 1, num_frames, dtype=int)
+    frames = []
+    for i in idxs:
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        ok, img = vidcap.read()
+        if ok:
+            rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            pil = Image.fromarray(rgb)
+            timestamp = round(i / fps, 2)
+            frames.append((pil, timestamp))
+    vidcap.release()
+    return frames
+def progress_bar_html(label: str) -> str:
+    return f'''<div style="display:flex; align-items:center;">
+  <span style="margin-right:10px; font-size:14px;">{label}</span>
+  <div style="width:110px; height:5px; background:#B0E0E6; border-radius:2px; overflow:hidden;">
+    <div style="width:100%; height:100%; background:#00FFFF; animation:load 1.5s linear infinite;"></div>
+  </div>
+</div>
+<style>@keyframes load{{0%{{transform:translateX(-100%)}}100%{{transform:translateX(100%)}}}}</style>'''
 @spaces.GPU
+def generate(prompt: str, files: list[str] = None):
+    files = files or []
+    # Determine mode
+    is_video = any(f.lower().endswith(('.mp4', '.avi', '.mov')) for f in files)
+    is_image = any(f.lower().endswith(('.jpg', '.png', '.jpeg', '.bmp')) for f in files)
+    if is_video:
+        yield progress_bar_html("Processing video with cosmos-reason1")
+        video = files[0]
+        frames = downsample_video(video)
+        # Build messages
+        messages = [
+            {"role": "system", "content": [{"type":"text","text":"You are a helpful assistant."}]},
+            {"role": "user", "content": [{"type":"text","text": prompt}]}
+        ]
+        for img, ts in frames:
+            path = f"frame_{uuid.uuid4().hex}.png"
+            img.save(path)
+            messages[1]["content"].extend([
+                {"type":"text","text": f"Frame {ts}:"},
+                {"type":"image","url": path}
+            ])
+        inputs = processor.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True,
+            return_dict=True, return_tensors="pt",
+            truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        Thread(target=model.generate, kwargs={**inputs, "streamer": streamer}).start()
+        buffer = ""
+        for txt in streamer:
+            buffer += txt.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+        return
+    if is_image:
+        yield progress_bar_html("Processing image with cosmos-reason1")
+        imgs = [Image.open(f) for f in files]
+        messages = [
+            {"role":"user","content":[*[{"type":"image","image":i} for i in imgs],{"type":"text","text":prompt}]}]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(
+            text=[prompt_full], images=imgs,
+            return_tensors="pt", padding=True,
+            truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
+        ).to(device)
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        Thread(target=model.generate, kwargs={**inputs, "streamer": streamer}).start()
+        out = ""
+        for txt in streamer:
+            out += txt.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield out
+        return
+    # No valid media
+    yield "Please upload at least one image or a video for inference."
+def main():
+    """Build the Gradio chat interface around ``generate`` and launch it."""
+    # The file picker is passed as an additional input next to the chat
+    # textbox and accepts several image/video uploads at once.
+    demo = gr.ChatInterface(
+        fn=generate,
+        additional_inputs=[
+            gr.File(label="Upload Images/Videos", file_types=["image", "video"], file_count="multiple")
         ],
+        description="# **cosmos-reason1 by nvidia**",
+        textbox=gr.Textbox(label="Prompt"),
+        cache_examples=False,
+        type="messages",
+        multimodal=True,
+        stop_btn="Stop Generation"
     )
+    # Queue at most 10 pending requests before launching.
+    # NOTE(review): share=True asks Gradio for a public share link; confirm
+    # this is intended for a hosted Space.
+    demo.queue(max_size=10).launch(share=True)
 if __name__ == "__main__":
+    main()