Spaces:

harsh99
/

Virtual-Cloths-TryOn

Running

App Files Files Community

harsh99 committed on Jul 23

Commit

76c374a

1 Parent(s): f6fbe2a

Mask-free model support added.

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +3 -1
.gradio/certificate.pem +31 -0
app.py +430 -0
load_model.py +5 -0
mask-based-output/vitonhd-512/unpaired/00654_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/01265_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/01985_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/02023_00.jpg +0 -0
{output → mask-based-output}/vitonhd-512/unpaired/02532_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/02944_00.jpg +0 -0
{output → mask-based-output}/vitonhd-512/unpaired/03191_00.jpg +0 -0
{output → mask-based-output}/vitonhd-512/unpaired/03921_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/05006_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/05378_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/07342_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/08088_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/08239_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/08650_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/08839_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/11085_00.jpg +0 -0
{output → mask-based-output}/vitonhd-512/unpaired/12345_00.jpg +0 -0
{output → mask-based-output}/vitonhd-512/unpaired/12419_00.jpg +0 -0
{output → mask-based-output}/vitonhd-512/unpaired/12562_00.jpg +0 -0
mask-based-output/vitonhd-512/unpaired/14651_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/00654_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/01265_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/01985_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/02023_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/02532_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/02944_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/03191_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/03921_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/05006_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/05378_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/07342_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/08088_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/08239_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/08650_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/08839_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/11085_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/12345_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/12419_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/12562_00.jpg +0 -0
mask-free-output/vitonhd-512/unpaired/14651_00.jpg +0 -0
sample_inference.ipynb → mask_based_inference.ipynb +93 -81
mask_free_inference.ipynb +449 -0
output/vitonhd-512/unpaired/00654_00.jpg +0 -0
output/vitonhd-512/unpaired/01265_00.jpg +0 -0
output/vitonhd-512/unpaired/01985_00.jpg +0 -0
output/vitonhd-512/unpaired/02023_00.jpg +0 -0

.gitignore CHANGED Viewed

@@ -1,7 +1,9 @@
 *inkpunk-diffusion-v1.ckpt
 *sd-v1-5-inpainting.ckpt
 *zalando-hd-resized.zip
 # *viton-hd-dataset.zip
 viton-hd-dataset/
 checkpoints/

 *inkpunk-diffusion-v1.ckpt
+*instruct-pix2pix*
 *sd-v1-5-inpainting.ckpt
 *zalando-hd-resized.zip
+*finetuned_weights.safetensors
+*maskfree_finetuned_weights.safetensors
 # *viton-hd-dataset.zip
 viton-hd-dataset/
 checkpoints/

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

app.py ADDED Viewed

	@@ -0,0 +1,430 @@

+import os
+import torch
+import gradio as gr
+from PIL import Image
+import numpy as np
+from typing import Optional
+# Import your custom modules
+from load_model import preload_models_from_standard_weights
+from utils import to_pil_image
+import inspect
+import os
+from typing import Union
+import PIL
+import numpy as np
+import torch
+import tqdm
+from diffusers.utils.torch_utils import randn_tensor
+from utils import (check_inputs_maskfree, get_time_embedding, numpy_to_pil, prepare_image, compute_vae_encodings)
+from ddpm import DDPMSampler
+class CatVTONPix2PixPipeline:
+    """Mask-free try-on inference pipeline assembled from preloaded sub-models.
+
+    The encoder, decoder and denoising network are taken from the
+    ``'encoder'``, ``'decoder'`` and ``'diffusion'`` entries of ``models``;
+    sampling is driven by a ``DDPMSampler``.
+    """
+    def __init__(
+        self,
+        weight_dtype=torch.float32,
+        device='cuda',
+        compile=False,
+        skip_safety_check=True,
+        use_tf32=True,
+        models=None,
+    ):
+        """Store the sub-models and create the sampler.
+
+        Args:
+            weight_dtype: dtype that inputs and latents are cast to.
+            device: Device used for the generator and all tensors.
+            compile: Accepted for interface compatibility; not used here.
+            skip_safety_check: Stored on the instance; not read in this class.
+            use_tf32: When true, allow TF32 matmuls on CUDA.
+            models: Mapping with optional ``'encoder'``, ``'decoder'`` and
+                ``'diffusion'`` entries. ``None`` means an empty mapping.
+        """
+        # A literal ``{}`` default would be one dict shared by every instance.
+        if models is None:
+            models = {}
+        self.device = device
+        self.weight_dtype = weight_dtype
+        self.skip_safety_check = skip_safety_check
+        self.models = models
+        self.generator = torch.Generator(device=device)
+        self.noise_scheduler = DDPMSampler(generator=self.generator)
+        self.encoder = models.get('encoder', None)
+        self.decoder = models.get('decoder', None)
+        self.unet = models.get('diffusion', None)
+        # Enable TF32 for faster float32 matmuls on GPUs that support it
+        if use_tf32:
+            torch.set_float32_matmul_precision("high")
+            torch.backends.cuda.matmul.allow_tf32 = True
+    @torch.no_grad()
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, torch.Tensor],
+        condition_image: Union[PIL.Image.Image, torch.Tensor],
+        num_inference_steps: int = 50,
+        guidance_scale: float = 2.5,
+        height: int = 1024,
+        width: int = 768,
+        generator=None,
+        eta=1.0,
+        **kwargs
+    ):
+        """Run the denoising loop and decode the try-on result.
+
+        Args:
+            image: Person image.
+            condition_image: Garment image used as the condition.
+            num_inference_steps: Number of sampler steps.
+            guidance_scale: Classifier-free guidance weight; guidance is only
+                applied when this is greater than 1.0.
+            height: Target height passed to ``check_inputs_maskfree``.
+            width: Target width passed to ``check_inputs_maskfree``.
+            generator: Optional generator for the initial noise draw.
+            eta: Accepted for interface compatibility; not used here.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            The decoded batch as converted by ``numpy_to_pil`` (presumably a
+            list of PIL images; the caller in this file takes element 0).
+        """
+        concat_dim = -1  # FIXME: y axis concat
+        # Prepare inputs to Tensor
+        image, condition_image = check_inputs_maskfree(image, condition_image, width, height)
+        # Ensure consistent dtype for all tensors
+        image = prepare_image(image).to(self.device, dtype=self.weight_dtype)
+        condition_image = prepare_image(condition_image).to(self.device, dtype=self.weight_dtype)
+        # Encode both images
+        image_latent = compute_vae_encodings(image, self.encoder)
+        condition_latent = compute_vae_encodings(condition_image, self.encoder)
+        del image, condition_image
+        # Concatenate person and garment latents along the last axis
+        condition_latent_concat = torch.cat([image_latent, condition_latent], dim=concat_dim)
+        # Prepare noise with the same shape as the concatenated condition
+        latents = randn_tensor(
+            condition_latent_concat.shape,
+            generator=generator,
+            device=condition_latent_concat.device,
+            dtype=self.weight_dtype,
+        )
+        # Prepare timesteps
+        self.noise_scheduler.set_inference_timesteps(num_inference_steps)
+        timesteps = self.noise_scheduler.timesteps
+        latents = self.noise_scheduler.add_noise(latents, timesteps[0])
+        # Classifier-Free Guidance: batch = [garment zeroed out, full condition]
+        if do_classifier_free_guidance := (guidance_scale > 1.0):
+            condition_latent_concat = torch.cat(
+                [
+                    torch.cat([image_latent, torch.zeros_like(condition_latent)], dim=concat_dim),
+                    condition_latent_concat,
+                ]
+            )
+        with tqdm.tqdm(total=num_inference_steps) as progress_bar:
+            for t in timesteps:
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = (torch.cat([latents] * 2) if do_classifier_free_guidance else latents)
+                # stack noisy latents and condition latents on the channel axis
+                p2p_latent_model_input = torch.cat([latent_model_input, condition_latent_concat], dim=1)
+                # predict the noise residual
+                timestep = t.repeat(p2p_latent_model_input.shape[0])
+                time_embedding = get_time_embedding(timestep).to(self.device, dtype=self.weight_dtype)
+                noise_pred = self.unet(
+                    p2p_latent_model_input,
+                    time_embedding
+                )
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_cond - noise_pred_uncond
+                    )
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.noise_scheduler.step(
+                    t, latents, noise_pred
+                )
+                # No warmup steps are used, so the bar advances on every step.
+                progress_bar.update()
+        # Keep only the first half along the concat axis (the person side)
+        latents = latents.split(latents.shape[concat_dim] // 2, dim=concat_dim)[0]
+        image = self.decoder(latents.to(self.device, dtype=self.weight_dtype))
+        # Map the decoder output from [-1, 1] to [0, 1]
+        image = (image / 2 + 0.5).clamp(0, 1)
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        image = numpy_to_pil(image)
+        return image
+def load_models():
+    """Load the mask-free weights and wrap them in a try-on pipeline.
+
+    Returns:
+        ``(models, pipeline)`` on success. ``(None, None)`` when a weight file
+        is missing, when loading yields nothing, or when any exception is
+        raised (its traceback is printed).
+    """
+    try:
+        print("🚀 Starting model loading process...")
+        # Report the CUDA situation before choosing a device
+        has_cuda = torch.cuda.is_available()
+        print(f"CUDA available: {has_cuda}")
+        if has_cuda:
+            print(f"CUDA device: {torch.cuda.get_device_name()}")
+            total_bytes = torch.cuda.get_device_properties(0).total_memory
+            free_memory = total_bytes - torch.cuda.memory_allocated(0)
+            print(f"Available CUDA memory: {free_memory / 1e9:.2f} GB")
+        device = "cpu" if not has_cuda else "cuda"
+        # Both weight files must be present next to the app
+        ckpt_path = "instruct-pix2pix-00-22000.ckpt"
+        finetune_path = "maskfree_finetuned_weights.safetensors"
+        required_files = (
+            (ckpt_path, f"❌ Checkpoint file not found: {ckpt_path}"),
+            (finetune_path, f"❌ Finetune weights file not found: {finetune_path}"),
+        )
+        for path, missing_message in required_files:
+            if not os.path.exists(path):
+                print(missing_message)
+                return None, None
+        print("📦 Loading models from weights...")
+        models = preload_models_from_standard_weights(
+            ckpt_path=ckpt_path,
+            device=device,
+            finetune_weights_path=finetune_path
+        )
+        if not models:
+            print("❌ Failed to load models")
+            return None, None
+        # One dtype for every sub-model avoids mixed precision mismatches
+        weight_dtype = torch.float32
+        print(f"Converting models to {weight_dtype}...")
+        for name, module in models.items():
+            if module is None:
+                continue
+            try:
+                models[name] = module.to(dtype=weight_dtype)
+                print(f"✅ {name} converted to {weight_dtype}")
+            except Exception as e:
+                # Best effort: keep the entry as it was and carry on
+                print(f"⚠️ Could not convert {name} to {weight_dtype}: {e}")
+        print("🔧 Initializing pipeline...")
+        pipeline = CatVTONPix2PixPipeline(
+            weight_dtype=weight_dtype,
+            device=device,
+            skip_safety_check=True,
+            models=models,
+        )
+        print("✅ Models and pipeline loaded successfully!")
+        return models, pipeline
+    except Exception as e:
+        print(f"❌ Error in load_models: {e}")
+        import traceback
+        traceback.print_exc()
+        return None, None
+def person_example_fn(image_path):
+    """Pass a selected example path through, mapping empty values to None."""
+    return image_path or None
+def create_demo(pipeline=None):
+    """Build the Gradio Blocks interface for mask-free virtual try-on.
+
+    Args:
+        pipeline: Callable try-on pipeline with a ``device`` attribute. When
+            ``None``, the submit handler reports that models are not loaded.
+
+    Returns:
+        The assembled ``gr.Blocks`` demo (not yet launched).
+    """
+    def submit_function_p2p(
+        person_image_path: Optional[str],
+        cloth_image_path: Optional[str],
+        num_inference_steps: int = 50,
+        guidance_scale: float = 2.5,
+        seed: int = 42,
+    ) -> Optional[Image.Image]:
+        """Run try-on inference for the two uploaded image paths.
+
+        Returns the generated image, or ``None`` when an upload is missing.
+
+        Raises:
+            gr.Error: On a missing file, missing pipeline, unreadable image or
+                inference failure. ``gr.Error`` is an exception and is only
+                shown in the UI when it is raised.
+        """
+        if not person_image_path or not cloth_image_path:
+            gr.Warning("Please upload both person and cloth images!")
+            return None
+        if not os.path.exists(person_image_path):
+            raise gr.Error("Person image file not found!")
+        if not os.path.exists(cloth_image_path):
+            raise gr.Error("Cloth image file not found!")
+        if pipeline is None:
+            raise gr.Error("Models not loaded! Please restart the application.")
+        # Load images
+        try:
+            person_image = Image.open(person_image_path).convert('RGB')
+            cloth_image = Image.open(cloth_image_path).convert('RGB')
+        except Exception as e:
+            raise gr.Error(f"Error loading images: {str(e)}") from e
+        try:
+            # Set up generator; -1 keeps the generator's default state
+            generator = torch.Generator(device=pipeline.device)
+            if seed != -1:
+                # Slider values may arrive as floats; manual_seed needs an int
+                generator.manual_seed(int(seed))
+            print("🔄 Processing virtual try-on...")
+            # Run inference
+            with torch.no_grad():
+                results = pipeline(
+                    person_image,
+                    cloth_image,
+                    num_inference_steps=int(num_inference_steps),
+                    guidance_scale=guidance_scale,
+                    height=512,
+                    width=384,
+                    generator=generator,
+                )
+        except Exception as e:
+            print(f"❌ Error in submit_function_p2p: {e}")
+            import traceback
+            traceback.print_exc()
+            raise gr.Error(f"Error during inference: {str(e)}") from e
+        # Process results: the pipeline may hand back a list of images
+        if isinstance(results, list) and len(results) > 0:
+            result = results[0]
+        else:
+            result = results
+        return result
+    # Custom CSS for better styling
+    css = """
+    .gradio-container {
+        max-width: 1200px !important;
+    }
+    .image-container {
+        max-height: 600px;
+    }
+    """
+    with gr.Blocks(css=css, title="Virtual Try-On") as demo:
+        gr.HTML("""
+        <div style="text-align: center; margin-bottom: 20px;">
+            <h1>🧥 Virtual Try-On with CatVTON</h1>
+            <p>Upload a person image and a clothing item to see how they look together!</p>
+        </div>
+        """)
+        with gr.Tab("Mask-Free Virtual Try-On"):
+            with gr.Row():
+                with gr.Column(scale=1, min_width=350):
+                    with gr.Row():
+                        # Hidden input that forwards a chosen path to the person image
+                        image_path_p2p = gr.Image(
+                            type="filepath",
+                            interactive=True,
+                            visible=False,
+                        )
+                        person_image_p2p = gr.Image(
+                            interactive=True,
+                            label="Person Image",
+                            type="filepath",
+                            elem_classes=["image-container"]
+                        )
+                    with gr.Row():
+                        cloth_image_p2p = gr.Image(
+                            interactive=True,
+                            label="Clothing Image",
+                            type="filepath",
+                            elem_classes=["image-container"]
+                        )
+                    submit_p2p = gr.Button("✨ Generate Try-On", variant="primary", size="lg")
+                    gr.Markdown(
+                        '<center><span style="color: #FF6B6B; font-weight: bold;">⚠️ Click only once and wait for processing!</span></center>'
+                    )
+                    with gr.Accordion("🔧 Advanced Options", open=False):
+                        num_inference_steps_p2p = gr.Slider(
+                            label="Inference Steps",
+                            minimum=10,
+                            maximum=100,
+                            step=5,
+                            value=50,
+                            info="More steps = better quality but slower"
+                        )
+                        guidance_scale_p2p = gr.Slider(
+                            label="Guidance Scale",
+                            minimum=0.0,
+                            maximum=7.5,
+                            step=0.5,
+                            value=2.5,
+                            info="Higher values = stronger conditioning"
+                        )
+                        seed_p2p = gr.Slider(
+                            label="Seed",
+                            minimum=-1,
+                            maximum=10000,
+                            step=1,
+                            value=42,
+                            info="Use -1 for random seed"
+                        )
+                with gr.Column(scale=2, min_width=500):
+                    result_image_p2p = gr.Image(
+                        interactive=False,
+                        label="Result (Person | Clothing | Generated)",
+                        elem_classes=["image-container"]
+                    )
+                    gr.Markdown("""
+                    ### 📋 Instructions:
+                    1. Upload a **person image** (front-facing works best)
+                    2. Upload a **clothing item** you want to try on
+                    3. Adjust advanced settings if needed
+                    4. Click "Generate Try-On" and wait
+                    ### 💡 Tips:
+                    - Use clear, high-resolution images
+                    - Person should be facing forward
+                    - Clothing items work best when laid flat or on a model
+                    - Try different seeds if you're not satisfied with results
+                    """)
+        # Event handlers
+        image_path_p2p.change(
+            person_example_fn,
+            inputs=image_path_p2p,
+            outputs=person_image_p2p
+        )
+        submit_p2p.click(
+            submit_function_p2p,
+            inputs=[
+                person_image_p2p,
+                cloth_image_p2p,
+                num_inference_steps_p2p,
+                guidance_scale_p2p,
+                seed_p2p,
+            ],
+            outputs=result_image_p2p,
+        )
+    return demo
+def app_gradio():
+    """Load the models, build the demo and serve it on port 7860."""
+    print("🚀 Loading models...")
+    models, pipeline = load_models()
+    # Either value being empty means loading failed; do not start the UI
+    if not (models and pipeline):
+        print("❌ Failed to load models. Please check your model files.")
+        return None
+    launch_options = dict(
+        share=True,
+        show_error=True,
+        server_name="0.0.0.0",
+        server_port=7860,
+    )
+    create_demo(pipeline=pipeline).launch(**launch_options)
+# Start the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    app_gradio()

load_model.py CHANGED Viewed

@@ -78,7 +78,12 @@ def load_finetuned_attention_weights(finetune_weights_path, diffusion, device):
 def preload_models_from_standard_weights(ckpt_path, device, finetune_weights_path=None):
     # CatVTON parameters
     in_channels = 9
     out_channels = 4
     state_dict=model_converter.load_from_standard_weights(ckpt_path, device)

 def preload_models_from_standard_weights(ckpt_path, device, finetune_weights_path=None):
     # CatVTON parameters
+    # in_channels: 8 for instruct-pix2pix (masked free), 9 for sd-v1-5-inpainting (masked based)
     in_channels = 9
+    if 'maskfree' in finetune_weights_path or 'mask_free' in finetune_weights_path:
+        in_channels = 8
     out_channels = 4
     state_dict=model_converter.load_from_standard_weights(ckpt_path, device)

mask-based-output/vitonhd-512/unpaired/00654_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/01265_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/01985_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/02023_00.jpg ADDED Viewed

{output → mask-based-output}/vitonhd-512/unpaired/02532_00.jpg RENAMED Viewed

File without changes

mask-based-output/vitonhd-512/unpaired/02944_00.jpg ADDED Viewed

{output → mask-based-output}/vitonhd-512/unpaired/03191_00.jpg RENAMED Viewed

File without changes

{output → mask-based-output}/vitonhd-512/unpaired/03921_00.jpg RENAMED Viewed

File without changes

mask-based-output/vitonhd-512/unpaired/05006_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/05378_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/07342_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/08088_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/08239_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/08650_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/08839_00.jpg ADDED Viewed

mask-based-output/vitonhd-512/unpaired/11085_00.jpg ADDED Viewed

{output → mask-based-output}/vitonhd-512/unpaired/12345_00.jpg RENAMED Viewed

File without changes

{output → mask-based-output}/vitonhd-512/unpaired/12419_00.jpg RENAMED Viewed

File without changes

{output → mask-based-output}/vitonhd-512/unpaired/12562_00.jpg RENAMED Viewed

File without changes

mask-based-output/vitonhd-512/unpaired/14651_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/00654_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/01265_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/01985_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/02023_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/02532_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/02944_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/03191_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/03921_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/05006_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/05378_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/07342_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/08088_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/08239_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/08650_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/08839_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/11085_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/12345_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/12419_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/12562_00.jpg ADDED Viewed

mask-free-output/vitonhd-512/unpaired/14651_00.jpg ADDED Viewed

sample_inference.ipynb → mask_based_inference.ipynb RENAMED Viewed

@@ -28,6 +28,76 @@
   {
    "cell_type": "code",
    "execution_count": 2,
    "id": "bab24c29",
    "metadata": {},
    "outputs": [
@@ -183,77 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "a069151e",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Loaded finetuned weights from finetuned_weights.safetensors\n",
-      "Loading 0.in_proj.weight\n",
-      "Loading 0.out_proj.weight\n",
-      "Loading 0.out_proj.bias\n",
-      "Loading 8.in_proj.weight\n",
-      "Loading 8.out_proj.weight\n",
-      "Loading 8.out_proj.bias\n",
-      "Loading 16.in_proj.weight\n",
-      "Loading 16.out_proj.weight\n",
-      "Loading 16.out_proj.bias\n",
-      "Loading 24.in_proj.weight\n",
-      "Loading 24.out_proj.weight\n",
-      "Loading 24.out_proj.bias\n",
-      "Loading 32.in_proj.weight\n",
-      "Loading 32.out_proj.weight\n",
-      "Loading 32.out_proj.bias\n",
-      "Loading 40.in_proj.weight\n",
-      "Loading 40.out_proj.weight\n",
-      "Loading 40.out_proj.bias\n",
-      "Loading 48.in_proj.weight\n",
-      "Loading 48.out_proj.weight\n",
-      "Loading 48.out_proj.bias\n",
-      "Loading 56.in_proj.weight\n",
-      "Loading 56.out_proj.weight\n",
-      "Loading 56.out_proj.bias\n",
-      "Loading 64.in_proj.weight\n",
-      "Loading 64.out_proj.weight\n",
-      "Loading 64.out_proj.bias\n",
-      "Loading 72.in_proj.weight\n",
-      "Loading 72.out_proj.weight\n",
-      "Loading 72.out_proj.bias\n",
-      "Loading 80.in_proj.weight\n",
-      "Loading 80.out_proj.weight\n",
-      "Loading 80.out_proj.bias\n",
-      "Loading 88.in_proj.weight\n",
-      "Loading 88.out_proj.weight\n",
-      "Loading 88.out_proj.bias\n",
-      "Loading 96.in_proj.weight\n",
-      "Loading 96.out_proj.weight\n",
-      "Loading 96.out_proj.bias\n",
-      "Loading 104.in_proj.weight\n",
-      "Loading 104.out_proj.weight\n",
-      "Loading 104.out_proj.bias\n",
-      "Loading 112.in_proj.weight\n",
-      "Loading 112.out_proj.weight\n",
-      "Loading 112.out_proj.bias\n",
-      "Loading 120.in_proj.weight\n",
-      "Loading 120.out_proj.weight\n",
-      "Loading 120.out_proj.bias\n",
-      "\n",
-      "Attention module weights loaded from {finetune_weights_path} successfully.\n"
-     ]
-    }
-   ],
-   "source": [
-    "import load_model\n",
-    "\n",
-    "models=load_model.preload_models_from_standard_weights(ckpt_path=\"sd-v1-5-inpainting.ckpt\", device=\"cuda\", finetune_weights_path=\"finetuned_weights.safetensors\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
    "id": "a729bf46",
    "metadata": {},
    "outputs": [
@@ -268,15 +268,27 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 50/50 [00:11<00:00,  4.48it/s]\n",
-      "100%|██████████| 50/50 [00:10<00:00,  4.55it/s]\n",
-      "100%|██████████| 50/50 [00:11<00:00,  4.43it/s]\n",
-      "100%|██████████| 50/50 [00:11<00:00,  4.53it/s]\n",
-      "100%|██████████| 50/50 [00:11<00:00,  4.53it/s]\n",
-      "100%|██████████| 50/50 [00:11<00:00,  4.51it/s]\n",
-      "100%|██████████| 50/50 [00:10<00:00,  4.57it/s]\n",
-      "100%|██████████| 50/50 [00:11<00:00,  4.51it/s]\n",
-      " 40%|████      | 8/20 [01:32<02:17, 11.49s/it]"
      ]
     }
    ],
@@ -299,7 +311,7 @@
     "    args.__dict__= {\n",
     "        \"dataset_name\": \"vitonhd\",\n",
     "        \"data_root_path\": \"./sample_dataset\",\n",
-    "        \"output_dir\": \"./output\",\n",
     "        \"seed\": 555,\n",
     "        \"batch_size\": 1,\n",
     "        \"num_inference_steps\": 50,\n",

   {
    "cell_type": "code",
    "execution_count": 2,
+   "id": "24bd99d5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded finetuned weights from finetuned_weights.safetensors\n",
+      "Loading 0.in_proj.weight\n",
+      "Loading 0.out_proj.weight\n",
+      "Loading 0.out_proj.bias\n",
+      "Loading 8.in_proj.weight\n",
+      "Loading 8.out_proj.weight\n",
+      "Loading 8.out_proj.bias\n",
+      "Loading 16.in_proj.weight\n",
+      "Loading 16.out_proj.weight\n",
+      "Loading 16.out_proj.bias\n",
+      "Loading 24.in_proj.weight\n",
+      "Loading 24.out_proj.weight\n",
+      "Loading 24.out_proj.bias\n",
+      "Loading 32.in_proj.weight\n",
+      "Loading 32.out_proj.weight\n",
+      "Loading 32.out_proj.bias\n",
+      "Loading 40.in_proj.weight\n",
+      "Loading 40.out_proj.weight\n",
+      "Loading 40.out_proj.bias\n",
+      "Loading 48.in_proj.weight\n",
+      "Loading 48.out_proj.weight\n",
+      "Loading 48.out_proj.bias\n",
+      "Loading 56.in_proj.weight\n",
+      "Loading 56.out_proj.weight\n",
+      "Loading 56.out_proj.bias\n",
+      "Loading 64.in_proj.weight\n",
+      "Loading 64.out_proj.weight\n",
+      "Loading 64.out_proj.bias\n",
+      "Loading 72.in_proj.weight\n",
+      "Loading 72.out_proj.weight\n",
+      "Loading 72.out_proj.bias\n",
+      "Loading 80.in_proj.weight\n",
+      "Loading 80.out_proj.weight\n",
+      "Loading 80.out_proj.bias\n",
+      "Loading 88.in_proj.weight\n",
+      "Loading 88.out_proj.weight\n",
+      "Loading 88.out_proj.bias\n",
+      "Loading 96.in_proj.weight\n",
+      "Loading 96.out_proj.weight\n",
+      "Loading 96.out_proj.bias\n",
+      "Loading 104.in_proj.weight\n",
+      "Loading 104.out_proj.weight\n",
+      "Loading 104.out_proj.bias\n",
+      "Loading 112.in_proj.weight\n",
+      "Loading 112.out_proj.weight\n",
+      "Loading 112.out_proj.bias\n",
+      "Loading 120.in_proj.weight\n",
+      "Loading 120.out_proj.weight\n",
+      "Loading 120.out_proj.bias\n",
+      "\n",
+      "Attention module weights loaded from {finetune_weights_path} successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import load_model\n",
+    "\n",
+    "models=load_model.preload_models_from_standard_weights(ckpt_path=\"sd-v1-5-inpainting.ckpt\", device=\"cuda\", finetune_weights_path=\"finetuned_weights.safetensors\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "id": "bab24c29",
    "metadata": {},
    "outputs": [
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "id": "a729bf46",
    "metadata": {},
    "outputs": [
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "100%|██████████| 50/50 [00:07<00:00,  7.04it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.32it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.01it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.82it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.86it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.25it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.24it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.89it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.90it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.02it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.40it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.15it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.79it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.07it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.14it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.32it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.13it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.05it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.06it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.09it/s]\n",
+      "100%|██████████| 20/20 [02:28<00:00,  7.40s/it]\n"
      ]
     }
    ],
     "    args.__dict__= {\n",
     "        \"dataset_name\": \"vitonhd\",\n",
     "        \"data_root_path\": \"./sample_dataset\",\n",
+    "        \"output_dir\": \"./mask-based-output\",\n",
     "        \"seed\": 555,\n",
     "        \"batch_size\": 1,\n",
     "        \"num_inference_steps\": 50,\n",

mask_free_inference.ipynb ADDED Viewed

	@@ -0,0 +1,449 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6d50f66c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model already downloaded.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check whether the model checkpoint is already downloaded; if not, download it.\n",
+    "import os\n",
+    "if not os.path.exists(\"instruct-pix2pix-00-22000.ckpt\"):\n",
+    "    !wget https://huggingface.co/timbrooks/instruct-pix2pix/resolve/main/instruct-pix2pix-00-22000.ckpt\n",
+    "else:\n",
+    "    print(\"Model already downloaded.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "3598a305",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded finetuned weights from maskfree_finetuned_weights.safetensors\n",
+      "Loading 0.in_proj.weight\n",
+      "Loading 0.out_proj.weight\n",
+      "Loading 0.out_proj.bias\n",
+      "Loading 8.in_proj.weight\n",
+      "Loading 8.out_proj.weight\n",
+      "Loading 8.out_proj.bias\n",
+      "Loading 16.in_proj.weight\n",
+      "Loading 16.out_proj.weight\n",
+      "Loading 16.out_proj.bias\n",
+      "Loading 24.in_proj.weight\n",
+      "Loading 24.out_proj.weight\n",
+      "Loading 24.out_proj.bias\n",
+      "Loading 32.in_proj.weight\n",
+      "Loading 32.out_proj.weight\n",
+      "Loading 32.out_proj.bias\n",
+      "Loading 40.in_proj.weight\n",
+      "Loading 40.out_proj.weight\n",
+      "Loading 40.out_proj.bias\n",
+      "Loading 48.in_proj.weight\n",
+      "Loading 48.out_proj.weight\n",
+      "Loading 48.out_proj.bias\n",
+      "Loading 56.in_proj.weight\n",
+      "Loading 56.out_proj.weight\n",
+      "Loading 56.out_proj.bias\n",
+      "Loading 64.in_proj.weight\n",
+      "Loading 64.out_proj.weight\n",
+      "Loading 64.out_proj.bias\n",
+      "Loading 72.in_proj.weight\n",
+      "Loading 72.out_proj.weight\n",
+      "Loading 72.out_proj.bias\n",
+      "Loading 80.in_proj.weight\n",
+      "Loading 80.out_proj.weight\n",
+      "Loading 80.out_proj.bias\n",
+      "Loading 88.in_proj.weight\n",
+      "Loading 88.out_proj.weight\n",
+      "Loading 88.out_proj.bias\n",
+      "Loading 96.in_proj.weight\n",
+      "Loading 96.out_proj.weight\n",
+      "Loading 96.out_proj.bias\n",
+      "Loading 104.in_proj.weight\n",
+      "Loading 104.out_proj.weight\n",
+      "Loading 104.out_proj.bias\n",
+      "Loading 112.in_proj.weight\n",
+      "Loading 112.out_proj.weight\n",
+      "Loading 112.out_proj.bias\n",
+      "Loading 120.in_proj.weight\n",
+      "Loading 120.out_proj.weight\n",
+      "Loading 120.out_proj.bias\n",
+      "\n",
+      "Attention module weights loaded from {finetune_weights_path} successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import load_model\n",
+    "\n",
+    "models=load_model.preload_models_from_standard_weights(ckpt_path=\"instruct-pix2pix-00-22000.ckpt\", device=\"cuda\", finetune_weights_path=\"maskfree_finetuned_weights.safetensors\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "78e3d8b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/mahesh/miniconda3/envs/harsh/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import inspect\n",
+    "import os\n",
+    "from typing import Union\n",
+    "\n",
+    "import PIL\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "from tqdm import tqdm\n",
+    "from diffusers.utils.torch_utils import randn_tensor\n",
+    "\n",
+    "from utils import (check_inputs_maskfree, get_time_embedding, numpy_to_pil, prepare_image, compute_vae_encodings)\n",
+    "from ddpm import DDPMSampler\n",
+    "\n",
+    "class CatVTONPix2PixPipeline:\n",
+    "    def __init__(\n",
+    "        self, \n",
+    "        weight_dtype=torch.float32,\n",
+    "        device='cuda',\n",
+    "        compile=False,\n",
+    "        skip_safety_check=True,\n",
+    "        use_tf32=True,\n",
+    "        models={},\n",
+    "    ):\n",
+    "        self.device = device\n",
+    "        self.weight_dtype = weight_dtype\n",
+    "        self.skip_safety_check = skip_safety_check\n",
+    "        self.models = models\n",
+    "\n",
+    "        self.generator = torch.Generator(device=device)\n",
+    "        self.noise_scheduler = DDPMSampler(generator=self.generator)\n",
+    "        # self.vae = AutoencoderKL.from_pretrained(\"stabilityai/sd-vae-ft-mse\").to(device, dtype=weight_dtype)\n",
+    "        self.encoder= models.get('encoder', None)\n",
+    "        self.decoder= models.get('decoder', None)\n",
+    " \n",
+    "        self.unet=models.get('diffusion', None)  \n",
+    "        # Enable TF32 matmuls for faster inference on Ampere GPUs (A100 and RTX 30 series).\n",
+    "        if use_tf32:\n",
+    "            torch.set_float32_matmul_precision(\"high\")\n",
+    "            torch.backends.cuda.matmul.allow_tf32 = True\n",
+    "\n",
+    "    @torch.no_grad()\n",
+    "    def __call__(\n",
+    "        self, \n",
+    "        image: Union[PIL.Image.Image, torch.Tensor],\n",
+    "        condition_image: Union[PIL.Image.Image, torch.Tensor],\n",
+    "        num_inference_steps: int = 50,\n",
+    "        guidance_scale: float = 2.5,\n",
+    "        height: int = 1024,\n",
+    "        width: int = 768,\n",
+    "        generator=None,\n",
+    "        eta=1.0,\n",
+    "        **kwargs\n",
+    "    ):\n",
+    "        concat_dim = -1  # FIXME: y axis concat intended; -1 joins along the last latent axis\n",
+    "        # Prepare inputs to Tensor\n",
+    "        image, condition_image = check_inputs_maskfree(image, condition_image, width, height)\n",
+    "        \n",
+    "        image = prepare_image(image).to(self.device, dtype=self.weight_dtype)\n",
+    "        \n",
+    "        condition_image = prepare_image(condition_image).to(self.device, dtype=self.weight_dtype)\n",
+    "        \n",
+    "        # Encode the image\n",
+    "        image_latent = compute_vae_encodings(image, self.encoder)\n",
+    "        condition_latent = compute_vae_encodings(condition_image, self.encoder)\n",
+    "        \n",
+    "        del image, condition_image\n",
+    "        # Concatenate the image and condition latents along concat_dim\n",
+    "        condition_latent_concat = torch.cat([image_latent, condition_latent], dim=concat_dim)\n",
+    "        # Prepare noise\n",
+    "        latents = randn_tensor(\n",
+    "            condition_latent_concat.shape,\n",
+    "            generator=generator,\n",
+    "            device=condition_latent_concat.device,\n",
+    "            dtype=self.weight_dtype,\n",
+    "        )\n",
+    "        # Prepare timesteps\n",
+    "        self.noise_scheduler.set_inference_timesteps(num_inference_steps)\n",
+    "        timesteps = self.noise_scheduler.timesteps\n",
+    "        # latents = latents * self.noise_scheduler.init_noise_sigma\n",
+    "        latents = self.noise_scheduler.add_noise(latents, timesteps[0])\n",
+    "        \n",
+    "        # Classifier-Free Guidance\n",
+    "        if do_classifier_free_guidance := (guidance_scale > 1.0):\n",
+    "            condition_latent_concat = torch.cat(\n",
+    "                [\n",
+    "                    torch.cat([image_latent, torch.zeros_like(condition_latent)], dim=concat_dim),\n",
+    "                    condition_latent_concat,\n",
+    "                ]\n",
+    "            )\n",
+    "\n",
+    "        num_warmup_steps = 0  # For simple DDPM, no warmup needed\n",
+    "        with tqdm(total=num_inference_steps) as progress_bar:\n",
+    "            for i, t in enumerate(timesteps):\n",
+    "                # expand the latents if we are doing classifier free guidance\n",
+    "                \n",
+    "                latent_model_input = (torch.cat([latents] * 2) if do_classifier_free_guidance else latents)\n",
+    "\n",
+    "                # prepare the UNet input: latents stacked with the condition latents on dim 1\n",
+    "                \n",
+    "                p2p_latent_model_input = torch.cat([latent_model_input, condition_latent_concat], dim=1)\n",
+    "                # predict the noise residual\n",
+    "                \n",
+    "                timestep = t.repeat(p2p_latent_model_input.shape[0])\n",
+    "                time_embedding = get_time_embedding(timestep).to(self.device, dtype=self.weight_dtype)\n",
+    "\n",
+    "                noise_pred = self.unet(\n",
+    "                    p2p_latent_model_input,\n",
+    "                    time_embedding\n",
+    "                )\n",
+    "                # perform guidance\n",
+    "                if do_classifier_free_guidance:\n",
+    "                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)\n",
+    "                    noise_pred = noise_pred_uncond + guidance_scale * (\n",
+    "                        noise_pred_text - noise_pred_uncond\n",
+    "                    )\n",
+    "                # compute the previous noisy sample x_t -> x_t-1\n",
+    "                latents = self.noise_scheduler.step(\n",
+    "                    t, latents, noise_pred\n",
+    "                )\n",
+    "                # advance the progress bar (num_warmup_steps is 0, so this runs every step)\n",
+    "                if i == len(timesteps) - 1 or (\n",
+    "                    (i + 1) > num_warmup_steps\n",
+    "                ):\n",
+    "                    progress_bar.update()\n",
+    "\n",
+    "        # Decode the final latents\n",
+    "        latents = latents.split(latents.shape[concat_dim] // 2, dim=concat_dim)[0]\n",
+    "        # latents = 1 / self.vae.config.scaling_factor * latents\n",
+    "        # image = self.vae.decode(latents.to(self.device, dtype=self.weight_dtype)).sample\n",
+    "        image = self.decoder(latents.to(self.device, dtype=self.weight_dtype))\n",
+    "        image = (image / 2 + 0.5).clamp(0, 1)\n",
+    "        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16\n",
+    "        image = image.cpu().permute(0, 2, 3, 1).float().numpy()\n",
+    "        image = numpy_to_pil(image)\n",
+    "        \n",
+    "        return image\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5627b2d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset vitonhd loaded, total 20 pairs.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 50/50 [00:07<00:00,  7.12it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.31it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.09it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.98it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.01it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.13it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.28it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  7.13it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.17it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.97it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.17it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.38it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.20it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.92it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.71it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.25it/s]\n",
+      "100%|██████████| 50/50 [00:06<00:00,  7.49it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.87it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.89it/s]\n",
+      "100%|██████████| 50/50 [00:07<00:00,  6.92it/s]\n",
+      "100%|██████████| 20/20 [02:26<00:00,  7.35s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import torch\n",
+    "import argparse\n",
+    "from torch.utils.data import DataLoader\n",
+    "from VITON_Dataset import VITONHDTestDataset\n",
+    "from tqdm import tqdm\n",
+    "from PIL import Image\n",
+    "\n",
+    "from utils import to_pil_image\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def main():\n",
+    "    args=argparse.Namespace()\n",
+    "    args.__dict__= {\n",
+    "        \"dataset_name\": \"vitonhd\",\n",
+    "        \"data_root_path\": \"./sample_dataset\",\n",
+    "        \"output_dir\": \"./mask-free-output\",\n",
+    "        \"seed\": 555,\n",
+    "        \"batch_size\": 1,\n",
+    "        \"num_inference_steps\": 50,\n",
+    "        \"guidance_scale\": 2.5,\n",
+    "        \"width\": 384,\n",
+    "        \"height\": 512,\n",
+    "        \"eval_pair\": False,\n",
+    "        \"concat_eval_results\": True,\n",
+    "        \"allow_tf32\": True,\n",
+    "        \"dataloader_num_workers\": 4,\n",
+    "        \"mixed_precision\": 'no',\n",
+    "        \"concat_axis\": 'y',\n",
+    "        \"enable_condition_noise\": True,\n",
+    "        \"is_train\": False\n",
+    "    }\n",
+    "\n",
+    "    # Pipeline\n",
+    "    pipeline = CatVTONPix2PixPipeline(\n",
+    "        weight_dtype={\n",
+    "            \"no\": torch.float32,\n",
+    "            \"fp16\": torch.float16,\n",
+    "            \"bf16\": torch.bfloat16,\n",
+    "        }[args.mixed_precision],\n",
+    "        device=\"cuda\",\n",
+    "        skip_safety_check=True,\n",
+    "        models=models,\n",
+    "    )\n",
+    "    # Dataset\n",
+    "    if args.dataset_name == \"vitonhd\":\n",
+    "        dataset = VITONHDTestDataset(args)\n",
+    "    else:\n",
+    "        raise ValueError(f\"Invalid dataset name {args.dataset_name}.\")\n",
+    "    print(f\"Dataset {args.dataset_name} loaded, total {len(dataset)} pairs.\")\n",
+    "    dataloader = DataLoader(\n",
+    "        dataset,\n",
+    "        batch_size=args.batch_size,\n",
+    "        shuffle=False,\n",
+    "        num_workers=args.dataloader_num_workers\n",
+    "    )\n",
+    "        \n",
+    "    # Inference\n",
+    "    generator = torch.Generator(device='cuda').manual_seed(args.seed)\n",
+    "    args.output_dir = os.path.join(args.output_dir, f\"{args.dataset_name}-{args.height}\", \"paired\" if args.eval_pair else \"unpaired\")\n",
+    "    if not os.path.exists(args.output_dir):\n",
+    "        os.makedirs(args.output_dir)\n",
+    "        \n",
+    "    for batch in tqdm(dataloader):\n",
+    "        person_images = batch['person']\n",
+    "        cloth_images = batch['cloth']\n",
+    "\n",
+    "        results = pipeline(\n",
+    "            person_images,\n",
+    "            cloth_images,\n",
+    "            num_inference_steps=args.num_inference_steps,\n",
+    "            guidance_scale=args.guidance_scale,\n",
+    "            height=args.height,\n",
+    "            width=args.width,\n",
+    "            generator=generator,\n",
+    "        )\n",
+    "        \n",
+    "        if args.concat_eval_results:\n",
+    "            person_images = to_pil_image(person_images)\n",
+    "            cloth_images = to_pil_image(cloth_images)\n",
+    "        for i, result in enumerate(results):\n",
+    "            person_name = batch['person_name'][i]\n",
+    "            output_path = os.path.join(args.output_dir, person_name)\n",
+    "            if not os.path.exists(os.path.dirname(output_path)):\n",
+    "                os.makedirs(os.path.dirname(output_path))\n",
+    "            if args.concat_eval_results:\n",
+    "                w, h = result.size\n",
+    "                concated_result = Image.new('RGB', (w*3, h))\n",
+    "                concated_result.paste(person_images[i], (0, 0))\n",
+    "                concated_result.paste(cloth_images[i], (w, 0))  \n",
+    "                concated_result.paste(result, (w*2, 0))\n",
+    "                result = concated_result\n",
+    "            result.save(output_path)\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    main()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "39537851",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22fb6113",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c374cc6",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bddce5df",
+   "metadata": {
+    "vscode": {
+     "languageId": "markdown"
+    }
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "harsh",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

output/vitonhd-512/unpaired/00654_00.jpg DELETED Viewed

Binary file (67.8 kB)

output/vitonhd-512/unpaired/01265_00.jpg DELETED Viewed

Binary file (35.2 kB)

output/vitonhd-512/unpaired/01985_00.jpg DELETED Viewed

Binary file (42.8 kB)

output/vitonhd-512/unpaired/02023_00.jpg DELETED Viewed

Binary file (40.4 kB)