Commit 35ef920 · Parent(s): c5ca37a
- app.py +207 -68
- checkpoint-31250/checkpoint-decoder-31250/pytorch_model.bin +1 -1
- checkpoint-31250/checkpoint-decoder-31250/training_decoder_args.bin +2 -2
- checkpoint-31250/checkpoint-encoder-31250/pytorch_model.bin +1 -1
- checkpoint-31250/checkpoint-encoder-31250/training_encoder_args.bin +2 -2
- checkpoint-31250/checkpoint-full-31250/training.bin +2 -2
- real_im_emb_plot.jpg +0 -0

app.py CHANGED
@@ -7,53 +7,194 @@ Original file is located at
     https://colab.research.google.com/drive/1I47sLakpuwERGzn-XoNct67mwiDS1mQD
 """
 
+import matplotlib.pyplot as plt
+import matplotlib
+
+import argparse
+import glob
+import logging
+import os
+import pickle
+import random
+
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+from tqdm import tqdm, trange
+from types import SimpleNamespace
+
+import sys
+sys.path.append('./Optimus/code/examples/big_ae/')
+sys.path.append('./Optimus/code/')
+
+from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, BertConfig
+from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForLatentConnector
+from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+from pytorch_transformers import BertForLatentConnector, BertTokenizer
+
+from modules import VAE
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 torch.set_float32_matmul_precision('high')
 
 from tqdm import tqdm
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-class BottleneckT5Autoencoder:
-    def __init__(self, model_path: str, device='cuda'):
-        self.device = device
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=512, torch_dtype=torch.bfloat16)
-        self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(self.device)
-        self.model.eval()
-        # self.model = torch.compile(self.model)
-
-    def embed(self, text: str) -> torch.FloatTensor:
-        inputs = self.tokenizer(text, return_tensors='pt', padding=True).to(self.device)
-        decoder_inputs = self.tokenizer('', return_tensors='pt').to(self.device)
-        return self.model(
-            **inputs,
-            decoder_input_ids=decoder_inputs['input_ids'],
-            encode_only=True,
-        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+################################################
+
+
+def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+        Args:
+            logits: logits distribution shape (vocabulary size)
+            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
+            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
+                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
+    top_k = min(top_k, logits.size(-1))  # Safety check
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        logits[indices_to_remove] = filter_value
+    return logits
+
+def sample_sequence_conditional(model, length, context, past=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, device='cpu', decoder_tokenizer=None):
+
+    context = torch.tensor(context, dtype=torch.long, device=device)
+    context = context.unsqueeze(0).repeat(num_samples, 1)
+    generated = context
+    with torch.no_grad():
+        while True:
+        # for _ in trange(length):
+            inputs = {'input_ids': generated, 'past': past}
+            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
+            next_token_logits = outputs[0][0, -1, :] / temperature
+            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
+            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
+
+            # pdb.set_trace()
+            if next_token.unsqueeze(0)[0,0].item() == decoder_tokenizer.encode('<EOS>')[0]:
+                break
+
+    return generated
+
+
+def latent_code_from_text(text,):# args):
+    tokenized1 = tokenizer_encoder.encode(text)
+    tokenized1 = [101] + tokenized1 + [102]
+    coded1 = torch.Tensor([tokenized1])
+    coded1 = torch.Tensor.long(coded1)
+    with torch.no_grad():
+        x0 = coded1
+        x0 = x0.to('cuda')
+        pooled_hidden_fea = model_vae.encoder(x0, attention_mask=(x0 > 0).float())[1]
+        mean, logvar = model_vae.encoder.linear(pooled_hidden_fea).chunk(2, -1)
+        latent_z = mean.squeeze(1)
+        coded_length = len(tokenized1)
+        return latent_z, coded_length
+
+# args
+def text_from_latent_code(latent_z):
+    past = latent_z
+    context_tokens = tokenizer_decoder.encode('<BOS>')
+
+    length = 128 # maximum length, but not used
+    out = sample_sequence_conditional(
+        model=model_vae.decoder,
+        context=context_tokens,
+        past=past,
+        length=length, # Chunyuan: Fix length; or use <EOS> to complete a sentence
+        temperature=.5,
+        top_k=100,
+        top_p=.95,
+        device='cuda',
+        decoder_tokenizer=tokenizer_decoder
+    )
+    text_x1 = tokenizer_decoder.decode(out[0,:].tolist(), clean_up_tokenization_spaces=True)
+    text_x1 = text_x1.split()[1:-1]
+    text_x1 = ' '.join(text_x1)
+    return text_x1
+
+
+################################################
+# Load model
+
+
+MODEL_CLASSES = {
+    'gpt2': (GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer),
+    'bert': (BertConfig, BertForLatentConnector, BertTokenizer)
+}
+
+latent_size = 768
+model_path = './checkpoint-31250/checkpoint-full-31250/'
+encoder_path = './checkpoint-31250/checkpoint-encoder-31250/'
+decoder_path = './checkpoint-31250/checkpoint-decoder-31250/'
+block_size = 100
+
+# Load a trained Encoder model and vocabulary that you have fine-tuned
+encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES['bert']
+model_encoder = encoder_model_class.from_pretrained(encoder_path, latent_size=latent_size)
+tokenizer_encoder = encoder_tokenizer_class.from_pretrained('bert-base-cased', do_lower_case=True)
+
+model_encoder.to('cuda')
+if block_size <= 0:
+    block_size = tokenizer_encoder.max_len_single_sentence  # Our input block size will be the max possible for the model
+block_size = min(block_size, tokenizer_encoder.max_len_single_sentence)
+
+# Load a trained Decoder model and vocabulary that you have fine-tuned
+decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES['gpt2']
+model_decoder = decoder_model_class.from_pretrained(decoder_path, latent_size=latent_size)
+tokenizer_decoder = decoder_tokenizer_class.from_pretrained('gpt2', do_lower_case=False)
+model_decoder.to('cuda')
+if block_size <= 0:
+    block_size = tokenizer_decoder.max_len_single_sentence  # Our input block size will be the max possible for the model
+block_size = min(block_size, tokenizer_decoder.max_len_single_sentence)
+
+# Load full model
+output_full_dir = '/home/ryn_mote/Misc/generative_recommender/text_space/'
+checkpoint = torch.load(os.path.join(model_path, 'training.bin'))
+
+# Chunyuan: Add Padding token to GPT2
+special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
+num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)
+print('We have added', num_added_toks, 'tokens to GPT2')
+model_decoder.resize_token_embeddings(len(tokenizer_decoder))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+assert tokenizer_decoder.pad_token == '<PAD>'
+
+
+# Evaluation
+model_vae = VAE(model_encoder, model_decoder, tokenizer_encoder, tokenizer_decoder, SimpleNamespace(**{'latent_size': latent_size, 'device':'cuda'}))
+model_vae.load_state_dict(checkpoint['model_state_dict'])
+print("Pre-trained Optimus is successfully loaded")
+model_vae.to('cuda').to(torch.bfloat16)
+model_vae = torch.compile(model_vae)
+
+l = latent_code_from_text('A photo of a mountain.')[0]
+t = text_from_latent_code(l)
+print(t, l, l.shape)
+################################################
 
 import gradio as gr
 import numpy as np

@@ -64,7 +205,7 @@ import pandas as pd
 import random
 import time
 
-
+
 dtype = torch.bfloat16
 torch.set_grad_enabled(False)
 

@@ -80,13 +221,20 @@ start_time = time.time()
 def generate(prompt, in_embs=None,):
   if prompt != '':
     print(prompt)
-    in_embs = in_embs / in_embs.abs().max() * .
-    in_embs =
+    in_embs = in_embs / in_embs.abs().max() * .6 if in_embs != None else None
+    in_embs = 1 * in_embs.to('cuda') + 1 * latent_code_from_text(prompt)[0] if in_embs != None else latent_code_from_text(prompt)[0]
   else:
     print('From embeds.')
-  in_embs = in_embs / in_embs.abs().max() * .
-
-
+  in_embs = in_embs / in_embs.abs().max() * .6
+  in_embs = in_embs.to('cuda').to(torch.bfloat16)
+  plt.close('all')
+  plt.hist(np.array(in_embs.detach().to('cpu').to(torch.float)).flatten(), bins=5)
+  plt.savefig('real_im_emb_plot.jpg')
+
+
+  text = text_from_latent_code(in_embs).replace('<unk> ', '')
+  in_embs = latent_code_from_text(text)[0]
+  print(text)
   return text, in_embs.to('cpu')
 
 

@@ -103,7 +251,6 @@ def next_one(embs, ys, calibrate_prompts):
         if len(calibrate_prompts) > 0:
             print('######### Calibrating with sample prompts #########')
             prompt = calibrate_prompts.pop(0)
-            print(prompt)
             text, img_embs = generate(prompt)
             embs += img_embs
             print(len(embs))

@@ -114,12 +261,12 @@ def next_one(embs, ys, calibrate_prompts):
 
             # handle case where every instance of calibration prompts is 'Neither' or 'Like' or 'Dislike'
             if len(list(set(ys))) <= 1:
-                embs.append(.01*torch.randn(
-                embs.append(.01*torch.randn(
+                embs.append(.01*torch.randn(latent_size))
+                embs.append(.01*torch.randn(latent_size))
                 ys.append(0)
                 ys.append(1)
             if len(list(ys)) < 10:
-                embs += [.01*torch.randn(
+                embs += [.01*torch.randn(latent_size)] * 3
                 ys += [0] * 3
 
             pos_indices = [i for i in range(len(embs)) if ys[i] == 1]

@@ -129,13 +276,6 @@ def next_one(embs, ys, calibrate_prompts):
             random.shuffle(pos_indices)
             random.shuffle(neg_indices)
 
-            #if len(pos_indices) - len(neg_indices) > 48 and len(pos_indices) > 80:
-            #    pos_indices = pos_indices[32:]
-            if len(neg_indices) - len(pos_indices) > 48/16 and len(pos_indices) > 6:
-                pos_indices = pos_indices[5:]
-            if len(neg_indices) - len(pos_indices) > 48/16 and len(neg_indices) > 6:
-                neg_indices = neg_indices[5:]
-
 
             if len(neg_indices) > 25:
                 neg_indices = neg_indices[1:]

@@ -150,17 +290,17 @@ def next_one(embs, ys, calibrate_prompts):
             indices = list(range(len(embs)))
 
             # also add the latest 0 and the latest 1
-            has_0 = False
-            has_1 = False
-            for i in reversed(range(len(ys))):
-                if ys[i] == 0 and has_0 == False:
-                    indices.append(i)
-                    has_0 = True
-                elif ys[i] == 1 and has_1 == False:
-                    indices.append(i)
-                    has_1 = True
-                if has_0 and has_1:
-                    break
+            #has_0 = False
+            #has_1 = False
+            #for i in reversed(range(len(ys))):
+            #    if ys[i] == 0 and has_0 == False:
+            #        indices.append(i)
+            #        has_0 = True
+            #    elif ys[i] == 1 and has_1 == False:
+            #        indices.append(i)
+            #        has_1 = True
+            #    if has_0 and has_1:
+            #        break
 
             # we may have just encountered a rare multi-threading diffusers issue (https://github.com/huggingface/diffusers/issues/5749);
             # this ends up adding a rating but losing an embedding, it seems.

@@ -177,7 +317,6 @@ def next_one(embs, ys, calibrate_prompts):
             print('Gathering coefficients')
             lin_class = SVC(max_iter=50000, kernel='linear', class_weight='balanced', C=.1).fit(feature_embs, chosen_y)
             coef_ = torch.tensor(lin_class.coef_, dtype=torch.double)
-            coef_ = coef_ / coef_.abs().max() * 3
             print(coef_.shape, 'COEF')
             print('Gathered')
 
    	
checkpoint-31250/checkpoint-decoder-31250/pytorch_model.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:956e4d5b697320e6edce57414e379130230773a06073ac61e234148a8b4bbf5d
 size 578805986
    	
checkpoint-31250/checkpoint-decoder-31250/training_decoder_args.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2d81aab70fe9efffb1a6897b867bc45772a53476b746b8ab650150d7c7cd22a7
+size 2337
    	
checkpoint-31250/checkpoint-encoder-31250/pytorch_model.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:12c72c37c42dc4b47d60e1f2cde70225c777927b52aaed16c21f75213eedf11a
 size 438007669
    	
checkpoint-31250/checkpoint-encoder-31250/training_encoder_args.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2d81aab70fe9efffb1a6897b867bc45772a53476b746b8ab650150d7c7cd22a7
+size 2337
    	
checkpoint-31250/checkpoint-full-31250/training.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:78f8d855caf0b82d2912afd262a166d8588c500b0b0576d00cf4910834215627
+size 2949730415
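The .bin entries in this commit are Git LFS pointers: only the pointer fields (oid sha256 and size) change, not inline binary content. A hedged way to check a fetched blob against the training.bin pointer just above — the local path is illustrative, while the oid and size are the values from that pointer:

import hashlib

path = 'checkpoint-31250/checkpoint-full-31250/training.bin'   # illustrative local path
expected_oid = '78f8d855caf0b82d2912afd262a166d8588c500b0b0576d00cf4910834215627'
expected_size = 2949730415

h = hashlib.sha256()
size = 0
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):   # stream in 1 MiB chunks
        h.update(chunk)
        size += len(chunk)

assert size == expected_size, f'size mismatch: {size} != {expected_size}'
assert h.hexdigest() == expected_oid, 'sha256 mismatch'
print('pointer verified')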
    	
real_im_emb_plot.jpg ADDED