Spaces: Runtime error

Mehdi Cherti committed · Commit 27911d6 · Parent(s): ee8f9c5

simplify

Files changed:
- clip_encoder.py (+1 -1)
- run.py → model_configs.py (+36 -93)
- test_ddgan.py (+83 -157)
clip_encoder.py CHANGED

@@ -16,7 +16,7 @@ class CLIPEncoder(nn.Module):
         self.model, _, _ = open_clip.create_model_and_transforms(model, pretrained=pretrained)
         self.output_size = self.model.transformer.width
 
-    def forward(self, texts, return_only_pooled=True):
+    def forward(self, texts, return_only_pooled=False):
         device = next(self.parameters()).device
         toks = open_clip.tokenize(toks := texts) if False else open_clip.tokenize(texts).to(device)
         x = self.model.token_embedding(toks)  # [batch_size, n_ctx, d_model]
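Usage note: the only change here is the default of return_only_pooled. A minimal calling sketch, assuming open_clip is installed and that the constructor takes model and pretrained names as in the hunk above; the weight names below are placeholders, and the hunk does not show exactly what forward returns when return_only_pooled=False:

    import torch
    from clip_encoder import CLIPEncoder

    # placeholder model/pretrained names, not taken from this commit
    encoder = CLIPEncoder(model="ViT-B-32", pretrained="openai")
    with torch.no_grad():
        out = encoder(["a chair in the form of an avocado"])  # return_only_pooled now defaults to False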
run.py → model_configs.py RENAMED

@@ -10,30 +10,41 @@ def base():
         "n": 8,
     },
     "model":{
-        "dataset"…
-        "…
-        "…
+        "dataset": "wds",
+        "seed": 0,
+        "cross_attention": False,
         "num_channels": 3,
+        "centered": True,
+        "use_geometric": False,
+        "beta_min": 0.1,
+        "beta_max": 20.0,
         "num_channels_dae": 128,
-        "…
-        "…
+        "n_mlp": 3,
+        "ch_mult": [1, 1, 2, 2, 4, 4],
         "num_res_blocks": 2,
-        "…
-        "…
-        "…
+        "attn_resolutions": (16,),
+        "dropout": 0.0,
+        "resamp_with_conv": True,
+        "conditional": True,
+        "fir": True,
+        "fir_kernel": [1, 3, 3, 1],
+        "skip_rescale": True,
+        "resblock_type": "biggan",
+        "progressive": "none",
+        "progressive_input": "residual",
+        "progressive_combine": "sum",
         "embedding_type": "positional",
-        "…
-        "…
-        "…
+        "fourier_scale": 16.0,
+        "not_use_tanh": False,
+        "image_size": 256,
+        "nz": 100,
+        "num_timesteps": 4,
         "z_emb_dim": 256,
-        "…
-        "…
-        "…
-        "…
-
-        "masked_mean": "",
-        "resume": "",
-    },
+        "t_emb_dim": 256,
+        "text_encoder": "google/t5-v1_1-base",
+        "masked_mean": True,
+        "cross_attention_block": "basic",
+    }
 }
 def ddgan_cc12m_v2():
     cfg = base()

(Removed lines marked with … were truncated in the rendered diff and could not be recovered.)
@@ -72,7 +83,7 @@ def ddgan_cc12m_v11():
     cfg = base()
     cfg['model']['text_encoder'] = "google/t5-v1_1-large"
     cfg['model']['classifier_free_guidance_proba'] = 0.2
-    cfg['model']['cross_attention'] = …
+    cfg['model']['cross_attention'] = True
     return cfg
 
 def ddgan_cc12m_v12():

@@ -102,7 +113,7 @@ def ddgan_cifar10_cond17():
     cfg['model']['image_size'] = 32
     cfg['model']['classifier_free_guidance_proba'] = 0.2
     cfg['model']['ch_mult'] = "1 2 2 2"
-    cfg['model']['cross_attention'] = …
+    cfg['model']['cross_attention'] = True
     cfg['model']['dataset'] = "cifar10"
     cfg['model']['n_mlp'] = 4
     return cfg

@@ -276,7 +287,7 @@ def ddgan_ddb_v7():
 
 def ddgan_ddb_v9():
     cfg = ddgan_ddb_v3()
-    cfg['model']['attn_resolutions'] = …
+    cfg['model']['attn_resolutions'] = [4, 8, 16, 32]
     return cfg
 
 def ddgan_laion_aesthetic_v15():
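The variant functions above all follow the same pattern: start from base() (or another variant) and override keys under cfg['model']. A sketch of how a new variant would be added (the name and override values are hypothetical, not part of this commit; the function would also need to be appended to the models list shown below to be reachable by name):

    def ddgan_example_variant():
        cfg = base()
        cfg['model']['text_encoder'] = "google/t5-v1_1-large"
        cfg['model']['cross_attention'] = True
        cfg['model']['image_size'] = 64
        return cfg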
@@ -313,6 +324,7 @@ models = [
     ddgan_cc12m_v13, # T5-XL + cross attention + classifier free guidance + random_resized_crop_v1 + cond attn
     ddgan_cc12m_v14, # T5-XL + cross attention + classifier free guidance + random_resized_crop_v1 + 300M model
     ddgan_cc12m_v15, # fine-tune v11 with --mismatch_loss and --grad_penalty_cond
+
     ddgan_laion_aesthetic_v1, # like ddgan_cc12m_v11 but fine-tuned on laion aesthetic
     ddgan_laion_aesthetic_v2, # like ddgan_laion_aesthetic_v1 but trained from scratch with the new cross attn discr
     ddgan_laion_aesthetic_v3, # like ddgan_laion_aesthetic_v1 but trained from scratch with T5-XL (continue from 23aug with mismatch and grad penalty and random_resized_crop_v1)

@@ -352,76 +364,7 @@ models = [
     ddgan_ddb_v12,
 ]
 
-def get_model(model_name):
+def get_model_config(model_name):
     for model in models:
         if model.__name__ == model_name:
-            return model()
+            return model()['model']
-
-
-def test(model_name, *, cond_text="", batch_size:int=None, epoch:int=None, guidance_scale:float=0, fid=False, real_img_dir="", q=0.0, seed=0, nb_images_for_fid=0, scale_factor_h=1, scale_factor_w=1, compute_clip_score=False, eval_name="", scale_method="convolutional", compute_image_reward=False):
-
-    cfg = get_model(model_name)
-    model = cfg['model']
-    if epoch is None:
-        paths = glob('./saved_info/dd_gan/{}/{}/netG_*.pth'.format(model["dataset"], model_name))
-        epoch = max(
-            [int(os.path.basename(path).replace(".pth", "").split("_")[1]) for path in paths]
-        )
-    args = {}
-    args['exp'] = model_name
-    args['image_size'] = model['image_size']
-    args['seed'] = seed
-    args['num_channels'] = model['num_channels']
-    args['dataset'] = model['dataset']
-    args['num_channels_dae'] = model['num_channels_dae']
-    args['ch_mult'] = model['ch_mult']
-    args['num_timesteps'] = model['num_timesteps']
-    args['num_res_blocks'] = model['num_res_blocks']
-    args['batch_size'] = model['batch_size'] if batch_size is None else batch_size
-    args['epoch'] = epoch
-    args['cond_text'] = f'"{cond_text}"'
-    args['text_encoder'] = model.get("text_encoder")
-    args['cross_attention'] = model.get("cross_attention")
-    args['guidance_scale'] = guidance_scale
-    args['masked_mean'] = model.get("masked_mean")
-    args['dynamic_thresholding_quantile'] = q
-    args['scale_factor_h'] = scale_factor_h
-    args['scale_factor_w'] = scale_factor_w
-    args['n_mlp'] = model.get("n_mlp")
-    args['scale_method'] = scale_method
-    args['attn_resolutions'] = model.get("attn_resolutions", "16")
-    if fid:
-        args['compute_fid'] = ''
-        args['real_img_dir'] = real_img_dir
-        args['nb_images_for_fid'] = nb_images_for_fid
-    if compute_clip_score:
-        args['compute_clip_score'] = ""
-
-    if compute_image_reward:
-        args['compute_image_reward'] = ""
-    if eval_name:
-        args["eval_name"] = eval_name
-    cmd = "python -u test_ddgan.py " + " ".join(f"--{k} {v}" for k, v in args.items() if v is not None)
-    print(cmd)
-    call(cmd, shell=True)
-
-def eval_results(model_name):
-    import pandas as pd
-    rows = []
-    cfg = get_model(model_name)
-    model = cfg['model']
-    paths = glob('./saved_info/dd_gan/{}/{}/fid*.json'.format(model["dataset"], model_name))
-    for path in paths:
-        with open(path, "r") as fd:
-            data = json.load(fd)
-            row = {}
-            row['fid'] = data['fid']
-            row['epoch'] = data['epoch_id']
-            rows.append(row)
-    out = './saved_info/dd_gan/{}/{}/fid.csv'.format(model["dataset"], model_name)
-    df = pd.DataFrame(rows)
-    df.to_csv(out, index=False)
-
-if __name__ == "__main__":
-    from clize import run
-    run([test, eval_results])
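A quick usage sketch of the renamed helper (assuming model_configs.py is on the import path; the config name is a placeholder for any function registered in models):

    from model_configs import get_model_config

    cfg = get_model_config("ddgan_cc12m_v11")      # placeholder name
    print(cfg["text_encoder"], cfg["image_size"])  # get_model_config now returns only the 'model' sub-dict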
test_ddgan.py CHANGED

@@ -11,8 +11,32 @@ import time
 import os
 import json
 import torchvision
+import random
+
 from score_sde.models.ncsnpp_generator_adagn import NCSNpp
+from torch.nn.functional import adaptive_avg_pool2d
+
+try:
+    from pytorch_fid.fid_score import calculate_activation_statistics, calculate_fid_given_paths, ImagePathDataset, compute_statistics_of_path, calculate_frechet_distance
+    from pytorch_fid.inception import InceptionV3
+except ImportError:
+    pass
+
+try:
+    import ImageReward as RM
+except ImportError:
+    pass
+
+
+try:
+    import clip
+except ImportError:
+    pass
+
 from encoder import build_encoder
+from clip_encoder import CLIPImageEncoder
+
+from model_configs import get_model_config
 
 #%% Diffusion coefficients
 def var_func_vp(t, beta_min, beta_max):
@@ -138,6 +162,12 @@ def sample_from_model(coefficients, generator, n_time, x_init, T, opt, cond=None):
     return x
 
 
+def sample(generator, x_init, cond=None):
+    return sample_from_model(
+        generator.pos_coeff, generator, n_time=generator.config.num_timesteps, x_init=x_init,
+        T=generator.time_schedule, opt=generator.config, cond=cond
+    )
+
 def sample_from_model_classifier_free_guidance(coefficients, generator, n_time, x_init, T, opt, text_encoder, cond=None, guidance_scale=0):
     x = x_init
     null = text_encoder([""] * len(x_init), return_only_pooled=False)
@@ -353,106 +383,84 @@ def get_fold_unfold(x, kernel_size, stride, split_input_params, uf=1, df=1):
 
     return fold, unfold, normalization, weighting
 
+class ObjectFromDict:
+    def __init__(self, d):
+        self.__dict__ = d
+
+def load_model(config, path, device="cpu"):
+    config = ObjectFromDict(config)
+    text_encoder = build_encoder(name=config.text_encoder, masked_mean=config.masked_mean)
+    config.cond_size = text_encoder.output_size
+    netG = NCSNpp(config)
+    ckpt = torch.load(path, map_location="cpu")
+    for key in list(ckpt.keys()):
+        if key.startswith("module"):
+            ckpt[key[7:]] = ckpt.pop(key)
+    netG.load_state_dict(ckpt)
+    netG.eval()
+    netG.pos_coeff = Posterior_Coefficients(config, device)
+    netG.text_encoder = text_encoder
+    netG.config = config
+    netG.time_schedule = get_time_schedule(config, device)
+    netG = netG.to(device)
+    return netG
 
 
 #%%
-def sample_and_test(args):
-    torch.manual_seed(args.seed)
-
-    device = 'cuda:0'
-    text_encoder =build_encoder(name=args.text_encoder, masked_mean=args.masked_mean).to(device)
-    args.cond_size = text_encoder.output_size
-    if args.dataset == 'cifar10':
-        real_img_dir = 'pytorch_fid/cifar10_train_stat.npy'
-    elif args.dataset == 'celeba_256':
-        real_img_dir = 'pytorch_fid/celeba_256_stat.npy'
-    elif args.dataset == 'lsun':
-        real_img_dir = 'pytorch_fid/lsun_church_stat.npy'
-    else:
-        real_img_dir = args.real_img_dir
-
-    to_range_0_1 = lambda x: (x + 1.) / 2.
-
-
+def sample_and_test(args):
+    torch.manual_seed(args.seed)
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    to_range_0_1 = lambda x: (x + 1.) / 2.
     if args.epoch_id == -1:
         epochs = range(1000)
     else:
         epochs = [args.epoch_id]
     if args.compute_image_reward:
-        import ImageReward as RM
         #image_reward = RM.load("ImageReward-v1.0", download_root=".").to(device)
         image_reward = RM.load("ImageReward.pt", download_root=".").to(device)
-
+    cfg = get_model_config(args.name)
     for epoch in epochs:
         args.epoch_id = epoch
-        …
-        …
+
+        path = './saved_info/dd_gan/{}/{}/netG_{}.pth'.format(cfg['dataset'], args.name, args.epoch_id)
+        next_next_path = './saved_info/dd_gan/{}/{}/netG_{}.pth'.format(cfg['dataset'], args.name, args.epoch_id+2)
+        print(path)
         if not os.path.exists(path):
             continue
         if not os.path.exists(next_next_path):
             break
         print("PATH", path)
-
-        #if not os.path.exists(next_path):
-        #    print(f"STOP at {epoch}")
-        #    break
-        try:
-            ckpt = torch.load(path, map_location=device)
-        except Exception:
-            continue
         suffix = '_' + args.eval_name if args.eval_name else ""
-        dest = './saved_info/dd_gan/{}/{}/eval_{}{}.json'.format(…
+        dest = './saved_info/dd_gan/{}/{}/eval_{}{}.json'.format(cfg['dataset'],'ddgan', args.epoch_id, suffix)
         if (args.compute_fid or args.compute_clip_score or args.compute_image_reward) and os.path.exists(dest):
             continue
-        print("…
-        …
-        …
-        …
-            if key.startswith("module"):
-                ckpt[key[7:]] = ckpt.pop(key)
-        netG.load_state_dict(ckpt)
-        netG.eval()
-
-
-        T = get_time_schedule(args, device)
-
-        pos_coeff = Posterior_Coefficients(args, device)
-
-
-        save_dir = "./generated_samples/{}".format(args.dataset)
+        print("Load epoch", args.epoch_id, "checkpoint")
+
+        netG = load_model(cfg, path, device=device)
+        save_dir = "./generated_samples/{}".format(cfg['dataset'])
 
         if not os.path.exists(save_dir):
             os.makedirs(save_dir)
 
 
        if args.compute_fid or args.compute_clip_score or args.compute_image_reward:
-
-            from pytorch_fid.fid_score import calculate_activation_statistics, calculate_fid_given_paths, ImagePathDataset, compute_statistics_of_path, calculate_frechet_distance
-            from pytorch_fid.inception import InceptionV3
-            import random
+            # Evaluate
             random.seed(args.seed)
             texts = open(args.cond_text).readlines()
             texts = [t.strip() for t in texts]
             if args.nb_images_for_fid:
                 random.shuffle(texts)
                 texts = texts[0:args.nb_images_for_fid]
-            #iters_needed = len(texts) // args.batch_size
-            #texts = list(map(lambda s:s.strip(), texts))
-            #ntimes = max(30000 // len(texts), 1)
-            #texts = texts * ntimes
             print("Text size:", len(texts))
-            #print("Iters:", iters_needed)
             i = 0
-
             if args.compute_fid:
                 dims = 2048
                 block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
                 inceptionv3 = InceptionV3([block_idx]).to(device)
 
             if args.compute_clip_score:
-                import clip
                 CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
                 CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
                 clip_model, preprocess = clip.load(args.clip_model, device)

(Removed lines marked with … were truncated in the rendered diff and could not be recovered.)
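Taken together, this hunk factors checkpoint handling into load_model and plain (guidance-free) sampling into the sample wrapper added earlier. A minimal programmatic sketch combining them with get_model_config (the config name and checkpoint path below are placeholders; the raw output is in [-1, 1] until rescaled, as in the script):

    import torch
    from model_configs import get_model_config
    from test_ddgan import load_model, sample

    cfg = get_model_config("ddgan_cc12m_v11")              # placeholder config name
    netG = load_model(cfg, "netG_550.pth", device="cuda")  # placeholder checkpoint path
    cond = netG.text_encoder(["a chair in the form of an avocado"])
    x_init = torch.randn(1, cfg['num_channels'], cfg['image_size'], cfg['image_size'], device="cuda")
    with torch.no_grad():
        img = (sample(generator=netG, x_init=x_init, cond=cond) + 1) / 2  # map to [0, 1]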
@@ -481,14 +489,14 @@ def sample_and_test(args):
             for b in range(0, len(texts), args.batch_size):
                 text = texts[b:b+args.batch_size]
                 with torch.no_grad():
-                    cond = text_encoder(text…
+                    cond = netG.text_encoder(text)
                     bs = len(text)
                     t0 = time.time()
-                    x_t_1 = torch.randn(bs, …
+                    x_t_1 = torch.randn(bs, cfg['num_channels'], cfg['image_size'], cfg['image_size']).to(device)
                     if args.guidance_scale:
                         fake_sample = sample_from_model_classifier_free_guidance(pos_coeff, netG, args.num_timesteps, x_t_1,T, args, text_encoder, cond=cond, guidance_scale=args.guidance_scale)
                     else:
-                        fake_sample = …
+                        fake_sample = sample(generator=model, x_init=x_init, cond=cond)
                     fake_sample = to_range_0_1(fake_sample)
 
                     if args.compute_fid:

@@ -513,8 +521,8 @@ def sample_and_test(args):
                     clip_scores.append(((imf * txtf).sum(dim=1)).cpu())
 
                     if args.compute_image_reward:
-                        for k, …
-                            img = …
+                        for k, img in enumerate(fake_sample):
+                            img = img.cpu().numpy().transpose(1,2,0)
                             img = img * 255
                             img = img.astype(np.uint8)
                             text_k = text[k]

@@ -542,7 +550,8 @@ def sample_and_test(args):
             with open(dest, "w") as fd:
                 json.dump(results, fd)
             print(results)
-        else:
+        else:
+            # just generate some samples
             if args.cond_text.endswith(".txt"):
                 texts = open(args.cond_text).readlines()
                 texts = [t.strip() for t in texts]

@@ -550,7 +559,6 @@ def sample_and_test(args):
                 texts = [args.cond_text] * args.batch_size
             clip_guidance = False
             if clip_guidance:
-                from clip_encoder import CLIPImageEncoder
                 cond = text_encoder(texts, return_only_pooled=False)
                 clip_image_model = CLIPImageEncoder().to(device)
                 x_t_1 = torch.randn(len(texts), args.num_channels,args.image_size*args.scale_factor_h, args.image_size*args.scale_factor_w).to(device)

@@ -559,14 +567,14 @@ def sample_and_test(args):
                 torchvision.utils.save_image(fake_sample, './samples_{}.jpg'.format(args.dataset))
 
             else:
-                cond = text_encoder(texts…
-                x_t_1 = torch.randn(len(texts), …
+                cond = netG.text_encoder(texts)
+                x_t_1 = torch.randn(len(texts), cfg['num_channels'], cfg['image_size'] * args.scale_factor_h, cfg['image_size'] * args.scale_factor_w).to(device)
                 t0 = time.time()
                 if args.guidance_scale:
                     if args.scale_factor_h > 1 or args.scale_factor_w > 1:
                         if args.scale_method == "convolutional":
                             split_input_params = {
-                                "ks": (…
+                                "ks": (cfg['image_size'], cfg['image_size']),
                                 "stride": (150, 150),
                                 "clip_max_tie_weight": 0.5,
                                 "clip_min_tie_weight": 0.01,

@@ -583,22 +591,17 @@ def sample_and_test(args):
                 else:
                     fake_sample = sample_from_model_classifier_free_guidance(pos_coeff, netG, args.num_timesteps, x_t_1,T, args, text_encoder, cond=cond, guidance_scale=args.guidance_scale)
             else:
-                fake_sample = …
+                fake_sample = sample(generator=netG, x_init=x_t_1, cond=cond)
 
             print(time.time() - t0)
             fake_sample = to_range_0_1(fake_sample)
-            torchvision.utils.save_image(fake_sample, '…
-
-
-
-
-
-
+            torchvision.utils.save_image(fake_sample, 'samples.jpg')
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser('ddgan parameters')
-    parser.add_argument('--…
-
+    parser.add_argument('--name', type=str, default="", help="model config name")
+    parser.add_argument('--batch_size', type=int, default=16)
+    parser.add_argument('--seed', type=int, default=1024, help='seed used for initialization')
     parser.add_argument('--compute_fid', action='store_true', default=False,
                         help='whether or not compute FID')
     parser.add_argument('--compute_clip_score', action='store_true', default=False,
@@ -608,92 +611,15 @@ if __name__ == '__main__':
 
     parser.add_argument('--clip_model', type=str,default="ViT-L/14")
     parser.add_argument('--eval_name', type=str,default="")
-
-    parser.add_argument('--epoch_id', type=int,default=1000)
+    parser.add_argument('--epoch_id', type=int,default=-1)
     parser.add_argument('--guidance_scale', type=float,default=0)
     parser.add_argument('--dynamic_thresholding_quantile', type=float,default=0)
-    parser.add_argument('--cond_text', type=str,default="…
+    parser.add_argument('--cond_text', type=str,default="a chair in the form of an avocado")
     parser.add_argument('--scale_factor_h', type=int,default=1)
     parser.add_argument('--scale_factor_w', type=int,default=1)
     parser.add_argument('--scale_method', type=str,default="convolutional")
-
-    parser.add_argument('--cross_attention', action='store_true',default=False)
-
-
-    parser.add_argument('--num_channels', type=int, default=3,
-                            help='channel of image')
-    parser.add_argument('--centered', action='store_false', default=True,
-                            help='-1,1 scale')
-    parser.add_argument('--use_geometric', action='store_true',default=False)
-    parser.add_argument('--beta_min', type=float, default= 0.1,
-                            help='beta_min for diffusion')
-    parser.add_argument('--beta_max', type=float, default=20.,
-                            help='beta_max for diffusion')
-
-
-    parser.add_argument('--num_channels_dae', type=int, default=128,
-                            help='number of initial channels in denosing model')
-    parser.add_argument('--n_mlp', type=int, default=3,
-                            help='number of mlp layers for z')
-    parser.add_argument('--ch_mult', nargs='+', type=int,
-                            help='channel multiplier')
-
-    parser.add_argument('--num_res_blocks', type=int, default=2,
-                            help='number of resnet blocks per scale')
-    parser.add_argument('--attn_resolutions', default=(16,), nargs='+', type=int,
-                            help='resolution of applying attention')
-    parser.add_argument('--dropout', type=float, default=0.,
-                            help='drop-out rate')
-    parser.add_argument('--resamp_with_conv', action='store_false', default=True,
-                            help='always up/down sampling with conv')
-    parser.add_argument('--conditional', action='store_false', default=True,
-                            help='noise conditional')
-    parser.add_argument('--fir', action='store_false', default=True,
-                            help='FIR')
-    parser.add_argument('--fir_kernel', default=[1, 3, 3, 1],
-                            help='FIR kernel')
-    parser.add_argument('--skip_rescale', action='store_false', default=True,
-                            help='skip rescale')
-    parser.add_argument('--resblock_type', default='biggan',
-                            help='tyle of resnet block, choice in biggan and ddpm')
-    parser.add_argument('--progressive', type=str, default='none', choices=['none', 'output_skip', 'residual'],
-                            help='progressive type for output')
-    parser.add_argument('--progressive_input', type=str, default='residual', choices=['none', 'input_skip', 'residual'],
-                            help='progressive type for input')
-    parser.add_argument('--progressive_combine', type=str, default='sum', choices=['sum', 'cat'],
-                            help='progressive combine method.')
-
-    parser.add_argument('--embedding_type', type=str, default='positional', choices=['positional', 'fourier'],
-                            help='type of time embedding')
-    parser.add_argument('--fourier_scale', type=float, default=16.,
-                            help='scale of fourier transform')
-    parser.add_argument('--not_use_tanh', action='store_true',default=False)
-
-    #geenrator and training
-    parser.add_argument('--exp', default='experiment_cifar_default', help='name of experiment')
-    parser.add_argument('--real_img_dir', default='./pytorch_fid/cifar10_train_stat.npy', help='directory to real images for FID computation')
-
-    parser.add_argument('--dataset', default='cifar10', help='name of dataset')
-    parser.add_argument('--image_size', type=int, default=32,
-                            help='size of image')
-
-    parser.add_argument('--nz', type=int, default=100)
-    parser.add_argument('--num_timesteps', type=int, default=4)
-
-
-    parser.add_argument('--z_emb_dim', type=int, default=256)
-    parser.add_argument('--t_emb_dim', type=int, default=256)
-    parser.add_argument('--batch_size', type=int, default=200, help='sample generating batch size')
-    parser.add_argument('--text_encoder', type=str, default="google/t5-v1_1-base")
-    parser.add_argument('--masked_mean', action='store_true',default=False)
     parser.add_argument('--nb_images_for_fid', type=int, default=0)
-
-
-
-
-
     args = parser.parse_args()
-
     sample_and_test(args)
 
 
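With the argparse block trimmed down, test_ddgan.py is now driven by a registered config name (--name) instead of the long list of architecture flags, which are read from model_configs.py. The removed run.py helper used to build such commands with subprocess; an equivalent minimal driver would look like the sketch below (the config name and epoch are placeholders; all flags shown exist in the parser above):

    from subprocess import call

    cmd = (
        "python -u test_ddgan.py "
        "--name ddgan_cc12m_v11 "  # placeholder: any function name registered in model_configs.models
        "--epoch_id 550 "          # placeholder checkpoint epoch (-1 sweeps all saved epochs)
        "--batch_size 4 "
        '--cond_text "a chair in the form of an avocado"'
    )
    print(cmd)
    call(cmd, shell=True)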