Update custom_generate/generate.py
custom_generate/generate.py  (+13 -70)
@@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 
 from transformers import GenerationConfig, LogitsProcessorList, StoppingCriteriaList
-from transformers.cache_utils import Cache,
+from transformers.cache_utils import Cache, EncoderDecoderCache
 from transformers.configuration_utils import PretrainedConfig
 from transformers.generation.utils import (
     ALL_CACHE_NAMES,
@@ -249,17 +249,13 @@ def _contrastive_search(
                     f"{model.__class__.__name__} does not support caching and therefore **can't** be used "
                     "for contrastive search."
                 )
-
-
-
-                or (
-                    isinstance(past_key_values, EncoderDecoderCache)
-                    and isinstance(past_key_values.self_attention_cache, DynamicCache)
-                )
+            elif (
+                not isinstance(past_key_values[0], (tuple, torch.Tensor))
+                or past_key_values[0][0].shape[0] != batch_size
             ):
                 raise ValueError(
-                    f"
-                    "
+                    f"{model.__class__.__name__} does not have a standard cache format and therefore **can't** be "
+                    "used for contrastive search without further modifications."
                 )
 
         # contrastive_search main logic start:
@@ -294,24 +290,7 @@ def _contrastive_search(
 
         if not sequential:
             # Replicates the new past_key_values to match the `top_k` candidates
-            past = model_kwargs["past_key_values"]
-            # If it is a static cache, modify it in-place layer after layer to save memory
-            if isinstance(past, DynamicCache) or (
-                isinstance(past, EncoderDecoderCache) and isinstance(past.self_attention_cache, DynamicCache)
-            ):
-                past.batch_repeat_interleave(top_k)
-            else:
-                new_key_values = []
-                for layer in past:
-                    items = []
-                    # item is either the key or the value matrix
-                    for item in layer:
-                        items.append(item.repeat_interleave(top_k, dim=0))
-                    new_key_values.append(tuple(items))
-
-                past = tuple(new_key_values)
-
-            model_kwargs["past_key_values"] = past
+            model_kwargs["past_key_values"].batch_repeat_interleave(top_k)
 
         if sequential:
             all_outputs = []
@@ -325,19 +304,10 @@ def _contrastive_search(
                     output_hidden_states=True,
                     output_attentions=output_attentions,
                 )
-                if isinstance(outputs["past_key_values"], DynamicCache) or (
-                    isinstance(outputs["past_key_values"], EncoderDecoderCache)
-                    and isinstance(outputs["past_key_values"].self_attention_cache, DynamicCache)
-                ):
-                    # Remove past K-V from output since we don't need to stack later
-                    outputs["past_key_values"] = None
-                    # Remove last token from past K-V since we don't want to append it at this point
-                    model_kwargs["past_key_values"].crop(-1)
-                else:
-                    raise ValueError(
-                        f"Unsupported cache type: {type(outputs['past_key_values'])}. Contrastive search requires "
-                        "dynamic cache, so set `cache_implementation='dynamic'` in the generation config."
-                    )
+                # Remove past K-V from output since we don't need to stack later
+                outputs["past_key_values"] = None
+                # Remove last token from past K-V since we don't want to append it at this point
+                model_kwargs["past_key_values"].crop(-1)
 
                 all_outputs.append(outputs)
             outputs = stack_model_outputs(all_outputs, model.config.get_text_config())
@@ -424,22 +394,7 @@ def _contrastive_search(
             next_past_key_values = None
             for possible_cache_name in ALL_CACHE_NAMES:
                 next_past_key_values = next_past_key_values or getattr(outputs, possible_cache_name, None)
-
-            if isinstance(next_past_key_values, DynamicCache) or (
-                isinstance(next_past_key_values, EncoderDecoderCache)
-                and isinstance(next_past_key_values.self_attention_cache, DynamicCache)
-            ):
-                next_past_key_values.batch_select_indices(augmented_idx)
-            else:
-                new_key_values = []
-                for layer in next_past_key_values:
-                    items = []
-                    # item is either the key or the value matrix
-                    for item in layer:
-                        items.append(item[augmented_idx, ...])
-                    new_key_values.append(tuple(items))
-
-                next_past_key_values = tuple(new_key_values)
+            next_past_key_values.batch_select_indices(augmented_idx)
 
             logit_for_next_step = torch.stack(torch.split(logits, top_k))[range(batch_size), selected_idx, :]
             logit_for_next_step = logit_for_next_step.to(input_ids.device)
@@ -503,19 +458,7 @@ def _contrastive_search(
     # Contrastive search works by forward looking at the next token, so we need to exclude it from
     # `past_key_values` to be consistent with the other decoding methods
    if model_kwargs.get("past_key_values") is not None:
-        if isinstance(model_kwargs["past_key_values"], DynamicCache) or (
-            isinstance(model_kwargs["past_key_values"], EncoderDecoderCache)
-            and isinstance(model_kwargs["past_key_values"].self_attention_cache, DynamicCache)
-        ):
-            model_kwargs["past_key_values"].crop(-1)
-        else:
-            past_key_values = []
-            for layer in model_kwargs["past_key_values"]:
-                layer_past_key_values = []
-                for item in layer:
-                    layer_past_key_values.append(item[..., :-1, :])
-                past_key_values.append(tuple(layer_past_key_values))
-            model_kwargs["past_key_values"] = tuple(past_key_values)
+        model_kwargs["past_key_values"].crop(-1)
 
     if model.config.is_encoder_decoder:
         return GenerateEncoderDecoderOutput(
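The rewritten hunks lean entirely on the in-place DynamicCache operations (batch_repeat_interleave, batch_select_indices, crop) instead of the removed legacy tuple loops. Below is a minimal sketch of those three calls, assuming a transformers release that ships them on DynamicCache (roughly v4.42 or later); the "gpt2" checkpoint, the prompt, and the chosen indices are purely illustrative.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import DynamicCache

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Contrastive search looks one token ahead", return_tensors="pt")
cache = DynamicCache()
with torch.no_grad():
    model(**inputs, past_key_values=cache, use_cache=True)  # prefill populates the cache in place

top_k = 4
cache.batch_repeat_interleave(top_k)  # 1 batch row -> top_k rows, one per candidate token
print(cache[0][0].shape[0])           # layer-0 keys now have batch dim top_k (same access pattern as the new check above)
selected = torch.tensor([2])          # pretend candidate 2 won the contrastive score
cache.batch_select_indices(selected)  # keep only the winning row
cache.crop(-1)                        # drop the newest cache entry (the look-ahead token in the real decoding loop)
print(cache.get_seq_length())         # prompt length minus one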
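For completeness, a hypothetical usage sketch of a Hub repository that ships this custom_generate/generate.py. It assumes a transformers release that supports Hub custom generation strategies via the custom_generate argument to generate(), uses "<user>/<repo>" as a placeholder for this repository's id, and assumes the custom method still honours the usual contrastive-search knobs (top_k, penalty_alpha).

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")

output_ids = model.generate(
    **inputs,
    custom_generate="<user>/<repo>",  # placeholder for the repo containing custom_generate/generate.py
    trust_remote_code=True,           # the decoding loop is downloaded from the Hub and executed
    top_k=4,                          # contrastive search: number of candidate tokens per step
    penalty_alpha=0.6,                # contrastive search: degeneration penalty weight
    max_new_tokens=32,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))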