Enzo Reis de Oliveira committed
Commit · b60e08a
1 Parent(s): 30f063f
Fixing again
This view is limited to 50 files because it contains too many changes; see the raw diff for the remaining files.
- smi-ted/inference/smi_ted_light/.gitattributes +2 -0
- smi-ted/inference/smi_ted_light/fast_transformers/__init__.py +15 -0
- smi-ted/inference/smi_ted_light/fast_transformers/aggregate/__init__.py +128 -0
- smi-ted/inference/smi_ted_light/fast_transformers/aggregate/aggregate_cpu.cpython-39-x86_64-linux-gnu.so +3 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/__init__.py +20 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/__init__.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/attention_layer.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/full_attention.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/linear_attention.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/attention_layer.py +113 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/causal_linear_attention.py +116 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/clustered_attention.py +195 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/conditional_full_attention.py +66 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/exact_topk_attention.py +88 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/full_attention.py +95 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/improved_clustered_attention.py +268 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/improved_clustered_causal_attention.py +257 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/linear_attention.py +92 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/local_attention.py +101 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention/reformer_attention.py +166 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__init__.py +17 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__pycache__/__init__.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__pycache__/registry.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__pycache__/spec.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/registry.py +61 -0
- smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/spec.py +126 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/__init__.py +59 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/__init__.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/attention_builders.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/base.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/transformer_builders.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/attention_builders.py +139 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/base.py +67 -0
- smi-ted/inference/smi_ted_light/fast_transformers/builders/transformer_builders.py +550 -0
- smi-ted/inference/smi_ted_light/fast_transformers/causal_product/__init__.py +78 -0
- smi-ted/inference/smi_ted_light/fast_transformers/causal_product/causal_product_cpu.cpython-39-x86_64-linux-gnu.so +3 -0
- smi-ted/inference/smi_ted_light/fast_transformers/clustering/__init__.py +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/clustering/hamming/__init__.py +115 -0
- smi-ted/inference/smi_ted_light/fast_transformers/clustering/hamming/cluster_cpu.cpython-39-x86_64-linux-gnu.so +3 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/__init__.py +10 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/__init__.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/event.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/event_dispatcher.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/filters.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/event.py +51 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/event_dispatcher.py +92 -0
- smi-ted/inference/smi_ted_light/fast_transformers/events/filters.py +141 -0
- smi-ted/inference/smi_ted_light/fast_transformers/feature_maps/__init__.py +12 -0
- smi-ted/inference/smi_ted_light/fast_transformers/feature_maps/__pycache__/__init__.cpython-310.pyc +0 -0
- smi-ted/inference/smi_ted_light/fast_transformers/feature_maps/__pycache__/base.cpython-310.pyc +0 -0
smi-ted/inference/smi_ted_light/.gitattributes
ADDED
@@ -0,0 +1,2 @@
+smi-ted/inference/smi_ted_light/fast_transformers/**/*.so filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
smi-ted/inference/smi_ted_light/fast_transformers/__init__.py
ADDED
@@ -0,0 +1,15 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""Provide a library with fast transformer implementations."""
+
+__author__ = "Angelos Katharopoulos, Apoorv Vyas"
+__copyright__ = "Copyright (c) 2020 Idiap Research Institute"
+__license__ = "MIT"
+__maintainer__ = "Angelos Katharopoulos, Apoorv Vyas"
+__email__ = "angelos.katharopoulos@idiap.ch, avyas@idiap.ch"
+__url__ = "https://github.com/idiap/fast-transformers"
+__version__ = "0.4.0"
smi-ted/inference/smi_ted_light/fast_transformers/aggregate/__init__.py
ADDED
@@ -0,0 +1,128 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+
+import torch
+
+from .aggregate_cpu import aggregate as aggregate_cpu, \
+    broadcast as broadcast_cpu
+try:
+    from .aggregate_cuda import aggregate as aggregate_gpu, \
+        broadcast as broadcast_gpu
+    from .clustered_aggregate_cuda import \
+        clustered_broadcast as clustered_broadcast_gpu, \
+        clustered_aggregate as clustered_aggregate_gpu
+
+except ImportError:
+    pass
+
+
+def aggregate(X, G, F, Y=None):
+    device = X.device
+    if Y is None:
+        Y = torch.zeros(
+            F.shape + (X.shape[-1],),
+            device=device,
+            dtype=X.dtype
+        )
+    else:
+        Y.zero_()
+
+    if device.type == "cpu":
+        aggregate_cpu(X, G, F, Y)
+    else:
+        aggregate_gpu(X, G, F, Y)
+
+    return Y
+
+
+def broadcast(Y, G, F, X=None):
+    device = Y.device
+    if X is None:
+        X = torch.zeros(
+            G.shape + (Y.shape[-1],),
+            device=device,
+            dtype=Y.dtype
+        )
+
+    if device.type == "cpu":
+        broadcast_cpu(Y, G, F, X)
+    else:
+        broadcast_gpu(Y, G, F, X)
+
+    return X
+
+
+# Divide the cluster into groups of equal size
+# as constrained by the shared memory
+def set_group(C, E):
+    C_per_block = int(192 * 64 / (E+1))
+    G_min = (C + C_per_block - 1) // C_per_block
+    for G in range(G_min, C+1):
+        if C % G == 0:
+            return G
+
+
+def clustered_broadcast(Y, groups, counts, factors, X=None):
+    device = Y.device
+    if X is None:
+        X = torch.zeros(
+            groups.shape + (Y.shape[-1],),
+            device=device,
+            dtype=Y.dtype
+        )
+    if device.type == "cpu":
+        broadcast_cpu(Y, groups, factors, X)
+    else:
+        N, H, C, E = Y.shape
+        _, _, L, _ = X.shape
+
+        # Following are some booking keeping parameters to facilitate the
+        # broadcast kernel that takes advantage of clustering
+        # More information can be found in the cuda file
+        with torch.no_grad():
+            threads = 256
+            G = set_group(C, E)
+            group_counts = counts.view(N, H, G, -1).sum(-1)
+            block_counts = (group_counts + threads - 1) // threads
+            total_blocks = block_counts.sum().item()
+            indx_maps = torch.ones(
+                (total_blocks, 5),
+                device=X.device,
+                dtype=torch.int32
+            )
+
+        clustered_broadcast_gpu(
+            Y,
+            groups,
+            factors,
+            X,
+            block_counts.int(),
+            group_counts.int(),
+            threads,
+            G,
+            total_blocks,
+            indx_maps
+        )
+    return X
+
+
+def clustered_aggregate(X, G, F, lengths, Y=None):
+    device = X.device
+    if Y is None:
+        Y = torch.zeros(
+            F.shape + (X.shape[-1],),
+            device=device,
+            dtype=X.dtype
+        )
+    else:
+        Y.zero_()
+
+    if device.type == "cpu":
+        aggregate_cpu(X, G, F, Y)
+    else:
+        clustered_aggregate_gpu(X, G, F, lengths, Y)
+    return Y
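The commit only vendors the compiled kernels, so as a reading aid here is a minimal pure-PyTorch sketch of what aggregate and broadcast appear to compute, reduced to a single (batch, head) slice. It is not part of the commit; the function names and 2-D shapes are illustrative assumptions, and the real CPU/CUDA extensions handle the full batched layout.

import torch

def aggregate_reference(X, G, F):
    """Sum the rows of X that share a group id into one row per group, scaled by F.

    X: (L, E) features, G: (L,) integer group ids in [0, C), F: (C,) per-group
    factors (e.g. 1/count to produce a mean).  Returns Y: (C, E).
    """
    C = F.shape[0]
    Y = torch.zeros(C, X.shape[-1], dtype=X.dtype)
    Y.index_add_(0, G, X)              # sum the members of each group
    return Y * F[:, None]              # scale each group row

def broadcast_reference(Y, G, F):
    """Copy each group's row back to all of its members, scaled by F."""
    return Y[G] * F[G][:, None]

# Toy check: two groups over four positions.
X = torch.arange(8, dtype=torch.float32).view(4, 2)
G = torch.tensor([0, 0, 1, 1])
F = torch.tensor([0.5, 0.5])           # 1/count -> per-group means
Y = aggregate_reference(X, G, F)       # (2, 2) group means
X_back = broadcast_reference(Y, G, torch.ones(2))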
smi-ted/inference/smi_ted_light/fast_transformers/aggregate/aggregate_cpu.cpython-39-x86_64-linux-gnu.so
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6bccb1a374d4649aaef6361cc41c9ffb471086464cc07a0d6d21c5b65adb0711
+size 138248
smi-ted/inference/smi_ted_light/fast_transformers/attention/__init__.py
ADDED
@@ -0,0 +1,20 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""Implementations of different types of attention mechanisms."""
+
+
+from .attention_layer import AttentionLayer
+from .full_attention import FullAttention
+from .linear_attention import LinearAttention
+#from .causal_linear_attention import CausalLinearAttention
+#from .clustered_attention import ClusteredAttention
+#from .improved_clustered_attention import ImprovedClusteredAttention
+#from .reformer_attention import ReformerAttention
+#from .conditional_full_attention import ConditionalFullAttention
+#from .exact_topk_attention import ExactTopKAttention
+#from .improved_clustered_causal_attention import ImprovedClusteredCausalAttention
+#from .local_attention import LocalAttention
smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (502 Bytes)

smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/attention_layer.cpython-310.pyc
ADDED
Binary file (4.14 kB)

smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/full_attention.cpython-310.pyc
ADDED
Binary file (3.32 kB)

smi-ted/inference/smi_ted_light/fast_transformers/attention/__pycache__/linear_attention.cpython-310.pyc
ADDED
Binary file (2.96 kB)
smi-ted/inference/smi_ted_light/fast_transformers/attention/attention_layer.py
ADDED
@@ -0,0 +1,113 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""The base attention layer performs all the query key value projections and
+output projections leaving the implementation of the attention to the inner
+attention module.
+
+The transformer layers, however, are agnostic of the attention implementation
+and any layer that implements the same interface can substitute for the
+attention layer.
+"""
+
+from torch.nn import Linear, Module
+
+from ..events import EventDispatcher, QKVEvent
+
+
+class AttentionLayer(Module):
+    """Implement the attention layer. Namely project the inputs to multi-head
+    queries, keys and values, call the attention implementation and then
+    reproject the output.
+
+    It can be thought of as a decorator (see decorator design patter) of an
+    attention layer.
+
+    Arguments
+    ---------
+        attention: Specific inner attention implementation that just computes a
+                   weighted average of values given a similarity of queries and
+                   keys.
+        d_model: The input feature dimensionality
+        n_heads: The number of heads for the multi head attention
+        d_keys: The dimensionality of the keys/queries
+                (default: d_model/n_heads)
+        d_values: The dimensionality of the values (default: d_model/n_heads)
+        event_dispatcher: str or EventDispatcher instance to be used by this
+                          module for dispatching events (default: the default
+                          global dispatcher)
+    """
+    def __init__(self, attention, d_model, n_heads, d_keys=None,
+                 d_values=None, event_dispatcher=""):
+        super(AttentionLayer, self).__init__()
+
+        # Fill d_keys and d_values
+        d_keys = d_keys or (d_model//n_heads)
+        d_values = d_values or (d_model//n_heads)
+
+        self.inner_attention = attention
+        self.query_projection = Linear(d_model, d_keys * n_heads)
+        self.key_projection = Linear(d_model, d_keys * n_heads)
+        self.value_projection = Linear(d_model, d_values * n_heads)
+        self.out_projection = Linear(d_values * n_heads, d_model)
+        self.n_heads = n_heads
+        self.event_dispatcher = EventDispatcher.get(event_dispatcher)
+
+    def forward(self, queries, keys, values, attn_mask, query_lengths,
+                key_lengths):
+        """Apply attention to the passed in queries/keys/values after
+        projecting them to multiple heads.
+
+        In the argument description we make use of the following sizes
+
+            - N: the batch size
+            - L: The maximum length of the queries
+            - S: The maximum length of the keys (the actual length per sequence
+              is given by the length mask)
+            - D: The input feature dimensionality passed in the constructor as
+              'd_model'
+
+        Arguments
+        ---------
+            queries: (N, L, D) The tensor containing the queries
+            keys: (N, S, D) The tensor containing the keys
+            values: (N, S, D) The tensor containing the values
+            attn_mask: An implementation of BaseMask that encodes where each
+                       query can attend to
+            query_lengths: An implementation of BaseMask that encodes how
+                           many queries each sequence in the batch consists of
+            key_lengths: An implementation of BaseMask that encodes how
+                         many queries each sequence in the batch consists of
+
+        Returns
+        -------
+            The new value for each query as a tensor of shape (N, L, D).
+        """
+        # Extract the dimensions into local variables
+        N, L, _ = queries.shape
+        _, S, _ = keys.shape
+        H = self.n_heads
+
+        # Project the queries/keys/values
+        queries = self.query_projection(queries).view(N, L, H, -1)
+        keys = self.key_projection(keys).view(N, S, H, -1)
+        values = self.value_projection(values).view(N, S, H, -1)
+
+        # Let the world know of the qkv
+        self.event_dispatcher.dispatch(QKVEvent(self, queries, keys, values))
+
+        # Compute the attention
+        new_values = self.inner_attention(
+            queries,
+            keys,
+            values,
+            attn_mask,
+            query_lengths,
+            key_lengths
+        ).view(N, L, -1)
+
+        # Project the output and return
+        return self.out_projection(new_values)
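For orientation, a typical self-attention call through this layer looks roughly like the sketch below. The import root and the FullMask/LengthMask constructors are assumptions: the masking module is part of fast-transformers and is imported by other files in this diff, but it is not among the 50 files shown, and the actual import path depends on how the Space puts the vendored package on sys.path.

import torch
from fast_transformers.attention import AttentionLayer, FullAttention
from fast_transformers.masking import FullMask, LengthMask  # assumed to ship with the vendored package

N, L, d_model, n_heads = 2, 10, 64, 4
layer = AttentionLayer(FullAttention(), d_model, n_heads)

x = torch.randn(N, L, d_model)
attn_mask = FullMask(L, L)                                    # every query may attend to every key
lengths = LengthMask(torch.full((N,), L, dtype=torch.int64))  # no padding in this toy batch

out = layer(x, x, x, attn_mask, lengths, lengths)             # self-attention, shape (N, L, d_model)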
smi-ted/inference/smi_ted_light/fast_transformers/attention/causal_linear_attention.py
ADDED
@@ -0,0 +1,116 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""Implement causally masked linear attention."""
+
+import torch
+from torch.nn import Module
+
+from ..attention_registry import AttentionRegistry, Optional, Callable, Int, \
+    EventDispatcherInstance
+from ..events import EventDispatcher
+from ..causal_product import causal_dot_product
+from ..feature_maps import elu_feature_map
+
+
+def causal_linear(Q, K, V):
+    Q = Q.permute(0,2,1,3).contiguous()
+    K = K.permute(0,2,1,3).contiguous()
+    V = V.permute(0,2,1,3).contiguous()
+    V_new = causal_dot_product(Q, K, V)
+    return V_new.permute(0,2,1,3).contiguous()
+
+
+class CausalLinearAttention(Module):
+    """Implement causally masked attention using dot product of feature maps in
+    O(N D^2) complexity.
+
+    See fast_transformers.attention.linear_attention.LinearAttention for the
+    general concept of replacing the softmax with feature maps. In addition to
+    that, we also make use of the fact that causal masking is a triangular mask
+    which allows us to apply the masking and still compute the attention in O(N
+    D^2) complexity.
+
+    Arguments
+    ---------
+        feature_map: callable, a callable that applies the feature map to the
+                     last dimension of a tensor (default: elu(x)+1)
+        eps: float, a small number to ensure the numerical stability of the
+             denominator (default: 1e-6)
+        event_dispatcher: str or EventDispatcher instance to be used by this
+                          module for dispatching events (default: the default
+                          global dispatcher)
+    """
+    def __init__(self, query_dimensions, feature_map=None, eps=1e-6,
+                 event_dispatcher=""):
+        super(CausalLinearAttention, self).__init__()
+        self.feature_map = (
+            feature_map(query_dimensions) if feature_map else
+            elu_feature_map(query_dimensions)
+        )
+        self.eps = eps
+        self.event_dispatcher = EventDispatcher.get(event_dispatcher)
+
+    def _make_sizes_compatible(self, Q, K):
+        """Either slice or pad K in case that the sizes do not match between Q
+        and K."""
+        N, L, H, E = Q.shape
+        _, S, _, _ = K.shape
+        if L == S:
+            return Q, K
+
+        if L < S:
+            return Q, K[:, :L, :, :]
+
+        if L > S:
+            return Q, torch.cat([K, K.new_zeros(N, L-S, H, E)], dim=1)
+
+    def forward(self, queries, keys, values, attn_mask, query_lengths,
+                key_lengths):
+        # Apply the feature map to the queries and keys
+        self.feature_map.new_feature_map(queries.device)
+        Q = self.feature_map.forward_queries(queries)
+        K = self.feature_map.forward_keys(keys)
+
+        # Apply the key padding mask and make sure the attn_mask is a
+        # lower triangular causal mask
+        if not attn_mask.lower_triangular:
+            raise RuntimeError(("CausalLinearAttention only supports full "
+                                "lower triangular masks"))
+        K = K * key_lengths.float_matrix[:, :, None, None]
+
+        # Ensure that Q and K have compatible sizes for the following
+        # computations, namely L == S
+        Q, K = self._make_sizes_compatible(Q, K)
+
+        # TODO: Shall we divide the Q and K with a relatively large number to
+        #       avoid numerical instabilities in computing the denominator?
+        #       We used to divide each with the max norm of all q and k but
+        #       that seems relatively costly for a simple normalization.
+
+        # Compute the normalizers
+        Z = 1/(torch.einsum("nlhi,nlhi->nlh", Q, K.cumsum(1)) + self.eps)
+
+        # Compute the unnormalized result
+        V = causal_linear(
+            Q,
+            K,
+            values
+        )
+
+        return V * Z[:, :, :, None]
+
+
+# Register the attention implementation so that it becomes available in our
+# builders
+AttentionRegistry.register(
+    "causal-linear", CausalLinearAttention,
+    [
+        ("query_dimensions", Int),
+        ("feature_map", Optional(Callable)),
+        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
+    ]
+)
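To make the normalizer Z and the causal_dot_product kernel easier to follow, here is an illustrative quadratic-time PyTorch reference (not part of the commit, with a hypothetical function name). Given already feature-mapped Q and K, it reproduces the core computation V * Z of the forward pass above, ignoring padding masks; the compiled kernel obtains the same sums in linear time by keeping running prefix sums instead of looping.

import torch

def causal_linear_attention_reference(Q, K, V, eps=1e-6):
    """Naive O(L^2) reference for causal linear attention.

    Q, K: (N, L, H, E) feature-mapped queries/keys (e.g. elu(x) + 1),
    V: (N, L, H, D).  Position i only attends to positions j <= i.
    """
    N, L, H, E = Q.shape
    out = []
    for i in range(L):
        # Running sums over the prefix j <= i
        S = torch.einsum("njhe,njhd->nhed", K[:, :i+1], V[:, :i+1])   # (N, H, E, D)
        Z = K[:, :i+1].sum(dim=1)                                     # (N, H, E)
        num = torch.einsum("nhe,nhed->nhd", Q[:, i], S)
        den = torch.einsum("nhe,nhe->nh", Q[:, i], Z) + eps
        out.append(num / den[..., None])
    return torch.stack(out, dim=1)                                    # (N, L, H, D)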
smi-ted/inference/smi_ted_light/fast_transformers/attention/clustered_attention.py
ADDED
@@ -0,0 +1,195 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""Implement clustered self attention."""
+
+from math import sqrt
+
+import torch
+import torch.autograd
+from torch.nn import Dropout, Module
+from torch.nn.init import normal_
+
+from ..attention_registry import AttentionRegistry, Optional, Float, Int, \
+    Bool, EventDispatcherInstance
+from ..events import EventDispatcher
+from ..masking import FullMask
+from ..aggregate import clustered_aggregate, clustered_broadcast
+from ..clustering.hamming import cluster
+from ..hashing import compute_hashes
+
+
+class _GroupQueries(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, Q, clusters, counts, lengths):
+        factors = 1./counts.float()
+        q_grouped = clustered_aggregate(Q, clusters, factors, lengths)
+        ctx.save_for_backward(clusters, counts, factors)
+
+        return q_grouped
+
+    @staticmethod
+    def backward(ctx, grad_q_grouped):
+        clusters, counts, factors = ctx.saved_tensors
+        grad_q = clustered_broadcast(grad_q_grouped, clusters, counts, factors)
+
+        return grad_q, None, None, None
+
+
+class _BroadcastValues(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, v_grouped, clusters, counts, lengths):
+        factors = torch.ones_like(counts, dtype=v_grouped.dtype)
+        V = clustered_broadcast(v_grouped, clusters, counts, factors)
+        ctx.save_for_backward(clusters, counts, factors, lengths)
+
+        return V
+
+    @staticmethod
+    def backward(ctx, grad_v):
+        clusters, counts, factors, lengths = ctx.saved_tensors
+        grad_v_grouped = clustered_aggregate(grad_v, clusters, factors, lengths)
+
+        return grad_v_grouped, None, None, None
+
+
+class ClusteredAttention(Module):
+    """Use LSH and clustering in the resulting Hamming space to group queries
+    that will have minimal L2 distance from each other.
+
+    Given the queries, keys, and values as Q, K, and V respectively, we
+    first cluster the queries in "C" groups and compute the "C" query centroids
+    Q_c.
+
+    We now use to the centroids Q_c to compute the attention using:
+
+        V'_c = softmax(Q_c.mm(K.t()), dim=-1).mm(V).
+
+    Now the computed values V'_c are "broadcasted" back to the query members
+    of the corresponding cluster.
+
+    Arguments
+    ---------
+        clusters: How many clusters to group the queries into
+        iterations: The number of lloyd iterations to perform (default: 10)
+        bits: How many bits to use for the hash (default: 32)
+        hash_bias: If true, hamming distance proportional to L2 distance
+                   If false, hamming distance proportional to cosine distance
+                   (default: True)
+        softmax_temp: The temperature to use for the softmax attention.
+                      (default: 1/sqrt(d_keys) where d_keys is computed at
+                      runtime)
+        attention_dropout: The dropout rate to apply to the attention
+                           (default: 0.1)
+        event_dispatcher: str or EventDispatcher instance to be used by this
+                          module for dispatching events (default: the default
+                          global dispatcher)
+    """
+    def __init__(self, clusters, iterations=10, bits=32,
+                 hash_bias=True, softmax_temp=None, attention_dropout=0.1,
+                 event_dispatcher=""):
+        super(ClusteredAttention, self).__init__()
+        self.clusters = clusters
+        self.iterations = iterations
+        self.bits = bits
+        self.hash_bias = hash_bias
+        self.softmax_temp = softmax_temp
+        self.dropout = Dropout(attention_dropout)
+        self.event_dispatcher = EventDispatcher.get(event_dispatcher)
+
+    def _create_query_groups(self, Q, query_lengths):
+        N, H, L, E = Q.shape
+
+        # Compute the hashes for all the queries
+        planes = Q.new_empty((self.bits, E+1))
+        normal_(planes)
+        if not self.hash_bias:
+            planes[:, -1] = 0
+        hashes = compute_hashes(Q.view(N*H*L, E), planes).view(N, H, L)
+
+        # Cluster the hashes and return the cluster index per query
+        clusters, counts = cluster(
+            hashes,
+            query_lengths._lengths.int(),
+            clusters=self.clusters,
+            iterations=self.iterations,
+            bits=self.bits
+        )
+        sorted_clusters, sorted_indx = torch.sort(clusters, dim=-1)
+        return (sorted_clusters, counts), sorted_indx
+
+    def _group_queries(self, Q, groups, lengths):
+        """Aggregate the Qs based on the index of cluster they belong to. Make
+        sure to allow for gradient propagation backwards from the grouped
+        queries to each query."""
+        q_grouped = _GroupQueries.apply(Q, *groups, lengths)
+        return q_grouped
+
+    def _broadcast_values(self, V, groups, lengths):
+        """Broadcast the values back to the correct positions but make sure
+        that the gradient flows properly."""
+        V_new = _BroadcastValues.apply(V.contiguous(), *groups, lengths)
+        return V_new
+
+    def forward(self, queries, keys, values, attn_mask, query_lengths,
+                key_lengths):
+        # Make sure that there is no attention mask
+        assert attn_mask.all_ones, ("Clustered attention cannot use an "
+                                    "arbitrary attention mask.")
+
+        queries = queries.permute(0,2,1,3).contiguous()
+        keys = keys.permute(0,2,1,3).contiguous()
+        values = values.permute(0,2,1,3).contiguous()
+
+        N, H, L, E = queries.shape
+        _, _, S, D = values.shape
+        softmax_temp = self.softmax_temp or 1./sqrt(E)
+
+        # Cluster the queries into groups
+        groups, sorted_indx = self._create_query_groups(queries, query_lengths)
+        # Re-organize queries so that first group belong to first cluster
+        # next to second cluster and so on. This improves kernel implementations.
+        # Note that this step is introduced after NeurIPS submission and
+        # now the complexity is O(N log(N)).
+        q_offset = torch.arange(N*H, device=queries.device).unsqueeze(-1) * L
+        q_flat = (sorted_indx.view(N*H, -1) + q_offset).reshape(-1)
+        s_queries = queries.reshape(-1, E).index_select(0, q_flat).view(N,H,L,E)
+
+        # Aggregate the re-arranged queries.
+        Q_grouped = self._group_queries(s_queries, groups, query_lengths._lengths.int())
+        # Compute the attention
+        QK = torch.einsum("nhle,nhse->nhls", Q_grouped, keys)
+        QK = QK + key_lengths.additive_matrix[:, None, None, :]
+        A = self.dropout(torch.softmax(softmax_temp * QK, dim=-1))
+        V = torch.einsum("nhls,nhsd->nhld", A, values)
+
+        # Broadcast grouped attention
+        V_broadcast = self._broadcast_values(V, groups, query_lengths._lengths.int())
+
+        # Reverse the previous mapping
+        rev_indx = torch.argsort(sorted_indx, dim=-1)
+        q_rev_flat = (rev_indx.view(N*H, -1) + q_offset).reshape(-1)
+        V_new = V_broadcast.reshape(-1, D).index_select(0, q_rev_flat).view(N,H,L,D)
+        V_new = V_new.permute(0, 2, 1, 3).contiguous()
+        return V_new
+
+
+
+
+# Register the attention implementation so that it becomes available in our
+# builders
+AttentionRegistry.register(
+    "clustered", ClusteredAttention,
+    [
+        ("clusters", Int),
+        ("iterations", Optional(Int, 10)),
+        ("bits", Optional(Int, 63)),
+        ("hash_bias", Optional(Bool, True)),
+        ("softmax_temp", Optional(Float)),
+        ("attention_dropout", Optional(Float, 0.1)),
+        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
+    ]
+)
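The group-then-broadcast approximation in the docstring is easier to see without the LSH, length masks, and custom kernels. The following single-head toy reference (not part of the commit, with an illustrative function name) implements V'_c = softmax(Q_c K^T) V and the broadcast back to the members of each cluster in plain PyTorch.

import torch

def clustered_attention_reference(Q, K, V, groups, softmax_temp):
    """Toy, single-head reference of the clustered approximation.

    Q: (L, E) queries, K: (S, E) keys, V: (S, D) values, groups: (L,) cluster
    id per query.  Queries in the same cluster share one attention
    distribution computed from their centroid.
    """
    C = int(groups.max().item()) + 1
    counts = torch.bincount(groups, minlength=C).clamp(min=1)
    centroids = torch.zeros(C, Q.shape[-1]).index_add_(0, groups, Q) / counts[:, None]
    A = torch.softmax(softmax_temp * centroids @ K.t(), dim=-1)   # (C, S)
    V_c = A @ V                                                   # (C, D): one value row per cluster
    return V_c[groups]                                            # broadcast back to every query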
smi-ted/inference/smi_ted_light/fast_transformers/attention/conditional_full_attention.py
ADDED
@@ -0,0 +1,66 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""Implement a self attention that delegates to full attention or another
+attention depending on the input sequence length."""
+
+import torch
+from torch.nn import Module
+
+from ..attention_registry import AttentionRegistry, Optional, Int, Float, \
+    EventDispatcherInstance
+from ..events import EventDispatcher
+from .full_attention import FullAttention
+
+
+class ConditionalFullAttention(Module):
+    """"Delegate to full attention if the input sequence is short.
+
+    Arguments
+    ---------
+        other_attention: Use the passed attention module if the sequence is
+                         longer than 'length_limit'.
+        length_limit: An integer denoting the maximum sequence length to
+                      consider.
+        softmax_temp: See fast_transformers.attention.full_attention.
+        attention_dropout: See fast_transformers.attention.full_attention.
+        event_dispatcher: str or EventDispatcher instance to be used by this
+                          module for dispatching events (default: the default
+                          global dispatcher)
+    """
+    def __init__(self, other_attention, length_limit=512, softmax_temp=None,
+                 attention_dropout=0.1, event_dispatcher=""):
+        super(ConditionalFullAttention, self).__init__()
+        self.full_attention = FullAttention(softmax_temp, attention_dropout)
+        self.other_attention = other_attention
+        self.length_limit = length_limit
+        self.event_dispatcher = EventDispatcher.get(event_dispatcher)
+
+    def forward(self, queries, keys, values, attn_mask, query_lengths,
+                key_lengths):
+        # Extract some shapes to compare with the length limit
+        L = queries.shape[1]
+        S = values.shape[1]
+
+        if L > self.length_limit or S > self.length_limit:
+            return self.other_attention(queries, keys, values, attn_mask,
+                                        query_lengths, key_lengths)
+        else:
+            return self.full_attention(queries, keys, values, attn_mask,
+                                       query_lengths, key_lengths)
+
+
+# Register the attention implementation so that it becomes available in our
+# builders
AttentionRegistry.register(
+    "conditional-full", ConditionalFullAttention,
+    [
+        ("length_limit", Optional(Int, 512)),
+        ("softmax_temp", Optional(Float)),
+        ("attention_dropout", Optional(Float, 0.1)),
+        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
+    ]
+)
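A plausible way to use this wrapper is sketched below. It assumes the vendored LinearAttention keeps the upstream query_dimensions constructor argument, and since ConditionalFullAttention is commented out of attention/__init__.py in this commit, it is imported from its module directly; the import root depends on how the Space configures sys.path.

from fast_transformers.attention import AttentionLayer, LinearAttention
from fast_transformers.attention.conditional_full_attention import ConditionalFullAttention

d_model, n_heads = 512, 8
# Exact softmax attention up to 512 tokens, linear attention beyond that.
inner = ConditionalFullAttention(
    LinearAttention(query_dimensions=d_model // n_heads),
    length_limit=512,
)
layer = AttentionLayer(inner, d_model, n_heads)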
smi-ted/inference/smi_ted_light/fast_transformers/attention/exact_topk_attention.py
ADDED
@@ -0,0 +1,88 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""Implement the oracle top-k attention. The top-k keys are exact ones.
+MultiHeadAttention module. Note that this module is to be used in conjuction
+with the AttentionLayer in order to work."""
+
+from math import sqrt
+
+import torch
+from torch.nn import Dropout, Module
+
+from ..attention_registry import AttentionRegistry, Optional, Int, Float, \
+    EventDispatcherInstance
+from ..events import EventDispatcher
+
+
+class ExactTopKAttention(Module):
+    """Implement the oracle top-k softmax attention.
+
+    Arguments
+    ---------
+        top-k: The top k keys to attend to (default: 32)
+        softmax_temp: The temperature to use for the softmax attention.
+                      (default: 1/sqrt(d_keys) where d_keys is computed at
+                      runtime)
+        attention_dropout: The dropout rate to apply to the attention
+                           (default: 0.1)
+        event_dispatcher: str or EventDispatcher instance to be used by this
+                          module for dispatching events (default: the default
+                          global dispatcher)
+    """
+    def __init__(self, topk=32, softmax_temp=None, attention_dropout=0.1,
+                 event_dispatcher=""):
+        super(ExactTopKAttention, self).__init__()
+        self.topk = topk
+        self.softmax_temp = softmax_temp
+        self.dropout = Dropout(attention_dropout)
+        self.event_dispatcher = EventDispatcher.get(event_dispatcher)
+
+    def forward(self, queries, keys, values, attn_mask, query_lengths,
+                key_lengths):
+        # Extract some shapes and compute the temperature
+        N, L, H, E = queries.shape
+        _, S, _, D = values.shape
+        softmax_temp = self.softmax_temp or 1./sqrt(E)
+
+        # Compute the unnormalized attention and apply the masks
+        QK = torch.einsum("nlhe,nshe->nhls", queries, keys)
+        topk = min(self.topk, S)
+
+        if not attn_mask.all_ones:
+            QK = QK + attn_mask.additive_matrix
+        QK = QK + key_lengths.additive_matrix[:, None, None]
+
+        topk_values, topk_idx = torch.topk(QK, topk, sorted=False, dim=-1)
+        mask = QK.new_ones(QK.shape) * float("-inf")
+        mask[
+            torch.arange(N, device=QK.device).view(N, 1, 1, 1),
+            torch.arange(H, device=QK.device).view(1, H, 1, 1),
+            torch.arange(L, device=QK.device).view(1, 1, L, 1),
+            topk_idx,
+        ] = 0.
+
+        QK = QK + mask
+
+        # Compute the attention and the weighted average
+        A = self.dropout(torch.softmax(softmax_temp * QK, dim=-1))
+        V = torch.einsum("nhls,nshd->nlhd", A, values)
+
+        # Make sure that what we return is contiguous
+        return V.contiguous()
+
+
+# Register the attention implementation so that it becomes available in our
+# builders
+AttentionRegistry.register(
+    "exact-topk", ExactTopKAttention,
+    [
+        ("topk", Optional(Int, 32)),
+        ("softmax_temp", Optional(Float)),
+        ("attention_dropout", Optional(Float, 0.1)),
+        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
+    ]
+)
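The advanced-indexing mask in the forward pass simply zeroes the positions that survive torch.topk and leaves -inf everywhere else, so non-top-k keys receive exactly zero weight after the softmax. A one-row illustration (not from the commit):

import torch

QK = torch.tensor([0.1, 2.0, -1.0, 0.7, 1.5])
topk_values, topk_idx = torch.topk(QK, k=3, sorted=False)
mask = torch.full_like(QK, float("-inf"))
mask[topk_idx] = 0.0                    # keep only the 3 largest scores
masked = QK + mask                      # [-inf, 2.0, -inf, 0.7, 1.5]
attn = torch.softmax(masked, dim=-1)    # weights on the masked keys are exactly 0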
smi-ted/inference/smi_ted_light/fast_transformers/attention/full_attention.py
ADDED
@@ -0,0 +1,95 @@
+#
+# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
+# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
+# Apoorv Vyas <avyas@idiap.ch>
+#
+
+"""Implement the full attention similar to the one implemented by PyTorch's
+MultiHeadAttention module. Note that this module is to be used in conjuction
+with the `fast_transformers.attention.attention_layer.AttentionLayer` in order
+to work."""
+
+from math import sqrt
+
+import torch
+from torch.nn import Dropout, Module
+
+from ..attention_registry import AttentionRegistry, Optional, Float, \
+    EventDispatcherInstance
+from ..events import EventDispatcher, AttentionEvent
+
+
+class FullAttention(Module):
+    """Implement the scaled dot product attention with softmax.
+
+    Arguments
+    ---------
+        softmax_temp: The temperature to use for the softmax attention.
+                      (default: 1/sqrt(d_keys) where d_keys is computed at
+                      runtime)
+        attention_dropout: The dropout rate to apply to the attention
+                           (default: 0.1)
+        event_dispatcher: str or EventDispatcher instance to be used by this
+                          module for dispatching events (default: the default
+                          global dispatcher)
+    """
+    def __init__(self, softmax_temp=None, attention_dropout=0.1,
+                 event_dispatcher=""):
+        super(FullAttention, self).__init__()
+        self.softmax_temp = softmax_temp
+        self.dropout = Dropout(attention_dropout)
+        self.event_dispatcher = EventDispatcher.get(event_dispatcher)
+
+    def forward(self, queries, keys, values, attn_mask, query_lengths,
+                key_lengths):
+        """Implements the multihead softmax attention.
+
+        Arguments
+        ---------
+            queries: (N, L, H, E) The tensor containing the queries
+            keys: (N, S, H, E) The tensor containing the keys
+            values: (N, S, H, D) The tensor containing the values
+            attn_mask: An implementation of BaseMask that encodes where each
+                       query can attend to
+            query_lengths: An implementation of BaseMask that encodes how
+                           many queries each sequence in the batch consists of
+            key_lengths: An implementation of BaseMask that encodes how
+                         many queries each sequence in the batch consists of
+        """
+        # Extract some shapes and compute the temperature
+        N, L, H, E = queries.shape
+        _, S, _, D = values.shape
+        softmax_temp = self.softmax_temp or 1./sqrt(E)
+
+        # Scale the queries instead of applying the softmax temperature to the
+        # dot products
+        queries = queries * softmax_temp
+
+        # Compute the unnormalized attention and apply the masks
+        QK = torch.einsum("nlhe,nshe->nhls", queries, keys)
+        if not attn_mask.all_ones:
+            QK = QK + attn_mask.additive_matrix
+        if not key_lengths.all_ones:
+            QK = QK + key_lengths.additive_matrix[:, None, None]
+
+        # Compute the attention and the weighted average
+        A = self.dropout(torch.softmax(QK, dim=-1))
+        V = torch.einsum("nhls,nshd->nlhd", A, values)
+
+        # Let the world know of the attention matrix
+        self.event_dispatcher.dispatch(AttentionEvent(self, A))
+
+        # Make sure that what we return is contiguous
+        return V.contiguous()
+
+
+# Register the attention implementation so that it becomes available in our
+# builders
+AttentionRegistry.register(
+    "full", FullAttention,
+    [
+        ("softmax_temp", Optional(Float)),
+        ("attention_dropout", Optional(Float, 0.1)),
+        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
+    ]
+)
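The registration at the end of the file is what lets the builders construct this module by the name "full". Assuming the vendored builders package mirrors upstream fast-transformers (its transformer_builders.py is added in this same commit but not shown in this 50-file view), usage would look roughly like the sketch below; the import root is again an assumption.

import torch
from fast_transformers.builders import TransformerEncoderBuilder

# Because FullAttention is registered under "full", the builder can pick it by name.
encoder = TransformerEncoderBuilder.from_kwargs(
    n_layers=2,
    n_heads=8,
    query_dimensions=64,
    value_dimensions=64,
    feed_forward_dimensions=1024,
    attention_type="full",
).get()

x = torch.randn(4, 100, 8 * 64)   # (batch, sequence, n_heads * query_dimensions)
y = encoder(x)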
smi-ted/inference/smi_ted_light/fast_transformers/attention/improved_clustered_attention.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
|
| 3 |
+
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
|
| 4 |
+
# Apoorv Vyas <avyas@idiap.ch>
|
| 5 |
+
#
|
| 6 |
+
|
| 7 |
+
"""Implement improved clustered self attention."""
|
| 8 |
+
|
| 9 |
+
from math import sqrt
|
| 10 |
+
|
| 11 |
+
import torch
|
| 12 |
+
import torch.autograd
|
| 13 |
+
from torch.nn import Dropout, Module
|
| 14 |
+
from torch.nn.init import normal_
|
| 15 |
+
|
| 16 |
+
from ..attention_registry import AttentionRegistry, Optional, Float, Int, \
|
| 17 |
+
Bool, EventDispatcherInstance
|
| 18 |
+
from ..events import EventDispatcher
|
| 19 |
+
from ..masking import FullMask
|
| 20 |
+
from ..aggregate import clustered_aggregate, clustered_broadcast
|
| 21 |
+
from ..clustering.hamming import cluster
|
| 22 |
+
from ..hashing import compute_hashes
|
| 23 |
+
from ..sparse_product import sparse_dot_product, sparse_weighted_average
|
| 24 |
+
from ..sparse_product import clustered_sparse_dot_product, \
|
| 25 |
+
clustered_sparse_weighted_average
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class _GroupQueries(torch.autograd.Function):
|
| 29 |
+
@staticmethod
|
| 30 |
+
def forward(ctx, Q, clusters, counts, lengths):
|
| 31 |
+
factors = 1./counts.float()
|
| 32 |
+
q_grouped = clustered_aggregate(Q, clusters, factors, lengths)
|
| 33 |
+
ctx.save_for_backward(clusters, counts, factors)
|
| 34 |
+
|
| 35 |
+
return q_grouped
|
| 36 |
+
|
| 37 |
+
@staticmethod
|
| 38 |
+
def backward(ctx, grad_q_grouped):
|
| 39 |
+
clusters, counts, factors = ctx.saved_tensors
|
| 40 |
+
grad_q = clustered_broadcast(grad_q_grouped, clusters, counts, factors)
|
| 41 |
+
|
| 42 |
+
return grad_q, None, None, None
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class _BroadcastValues(torch.autograd.Function):
|
| 46 |
+
@staticmethod
|
| 47 |
+
def forward(ctx, v_grouped, clusters, counts, lengths):
|
| 48 |
+
factors = torch.ones_like(counts, dtype=v_grouped.dtype)
|
| 49 |
+
V = clustered_broadcast(v_grouped, clusters, counts, factors)
|
| 50 |
+
ctx.save_for_backward(clusters, counts, factors, lengths)
|
| 51 |
+
|
| 52 |
+
return V
|
| 53 |
+
|
| 54 |
+
@staticmethod
|
| 55 |
+
def backward(ctx, grad_v):
|
| 56 |
+
clusters, counts, factors, lengths = ctx.saved_tensors
|
| 57 |
+
grad_v_grouped = clustered_aggregate(grad_v, clusters, factors, lengths)
|
| 58 |
+
|
| 59 |
+
return grad_v_grouped, None, None, None, None
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class ImprovedClusteredAttention(Module):
|
| 63 |
+
"""
|
| 64 |
+
Immproved clustered attention approximation by recompution attention
|
| 65 |
+
for each query with the top-k keys for the corresponding cluster.
|
| 66 |
+
|
| 67 |
+
Given the queries, keys, and values as Q, K, and V respectively, we
|
| 68 |
+
first cluster the queries in "C" groups and compute the "C" query centroids
|
| 69 |
+
Q_c.
|
| 70 |
+
|
| 71 |
+
We now use to the centroids Q_c to identify the top-k keys with highest
|
| 72 |
+
dot products.
|
| 73 |
+
|
| 74 |
+
Subsequently, for each query we compute the sparse dot product with
|
| 75 |
+
the corresponding top-k keys to improve the attention approximation.
|
| 76 |
+
|
| 77 |
+
Arguments
|
| 78 |
+
---------
|
| 79 |
+
clusters: How many clusters to group the queries into
|
| 80 |
+
iterations: The number of lloyd iterations to perform (default: 10)
|
| 81 |
+
bits: How many bits to use for the hash (default: 32)
|
| 82 |
+
hash_bias: If true, hamming distance proportional to L2 distance
|
| 83 |
+
If false, hamming distance proportional to cosine distance
|
| 84 |
+
(default: True)
|
| 85 |
+
topk: Number of top-k keys to for improved approximation (default: 32)
|
| 86 |
+
softmax_temp: The temperature to use for the softmax attention.
|
| 87 |
+
(default: 1/sqrt(d_keys) where d_keys is computed at
|
| 88 |
+
runtime)
|
| 89 |
+
attention_dropout: The dropout rate to apply to the attention
|
| 90 |
+
(default: 0.1)
|
| 91 |
+
event_dispatcher: str or EventDispatcher instance to be used by this
|
| 92 |
+
module for dispatching events (default: the default
|
| 93 |
+
global dispatcher)
|
| 94 |
+
"""
|
| 95 |
+
def __init__(self, clusters, iterations=10, bits=32,
|
| 96 |
+
hash_bias=True, topk=32, softmax_temp=None,
|
| 97 |
+
attention_dropout=0.1, event_dispatcher=""):
|
| 98 |
+
super(ImprovedClusteredAttention, self).__init__()
|
| 99 |
+
self.clusters = clusters
|
| 100 |
+
self.iterations = iterations
|
| 101 |
+
self.bits = bits
|
| 102 |
+
self.hash_bias = hash_bias
|
| 103 |
+
self.topk = topk
|
| 104 |
+
self.softmax_temp = softmax_temp
|
| 105 |
+
        self.dropout = Dropout(attention_dropout)
        self.event_dispatcher = EventDispatcher.get(event_dispatcher)

    def _create_query_groups(self, Q, query_lengths):
        N, H, L, E = Q.shape

        # Compute the hashes for all the queries
        planes = Q.new_empty((self.bits, E+1))
        normal_(planes)
        if not self.hash_bias:
            planes[:, -1] = 0
        hashes = compute_hashes(Q.view(N*H*L, E), planes).view(N, H, L)

        # Cluster the hashes and return the cluster index per query
        clusters, counts = cluster(
            hashes,
            query_lengths._lengths.int(),
            clusters=self.clusters,
            iterations=self.iterations,
            bits=self.bits
        )
        sorted_clusters, sorted_indx = torch.sort(clusters, dim=-1)
        return (sorted_clusters, counts), sorted_indx

    def _topk_attention(self, Q, K, V,
                        clusters, counts,
                        topk, topk_values,
                        A_bottomk, softmax_temp,
                        query_lengths):
        """Return the attention with just the topk heads."""
        # Extract some indices
        N, H, L, E = Q.shape
        _, _, S, _ = K.shape
        _, _, C, k = topk.shape

        # We need to pass the output tensor to initialize to 0
        QK = clustered_sparse_dot_product(
            Q, K, topk,
            clusters, counts,
            query_lengths._lengths.int()
        )
        # We need to mask the topk dot products if topk > input_length
        QK = QK.masked_fill(
            torch.isinf(topk_values[:,0,0,:]).view(N, 1, 1, k),
            float("-inf")
        )
        A = torch.softmax(softmax_temp * QK, dim=-1)
        assert A_bottomk.is_contiguous()
        A_bottomk = clustered_broadcast(
            A_bottomk.unsqueeze(3),
            clusters,
            counts,
            torch.ones_like(counts, dtype=torch.float32)
        )
        A = A * (1.0 - A_bottomk)
        A = self.dropout(A)
        assert A.is_contiguous()
        V_new = clustered_sparse_weighted_average(A, V, topk, clusters, counts)

        return V_new

    def _broadcast_values(self, V, clusters, counts, lengths):
        """Broadcast the values back to the correct positions but make sure
        that the gradient flows properly."""
        V_new = _BroadcastValues.apply(V.contiguous(), clusters, counts, lengths)
        return V_new

    def _bottomk_attention(self, QK, V, clusters, counts, query_lengths, topk, softmax_temp):
        """Return the attention with just the bottomk keys."""
        N, H, C, S = QK.shape

        A = torch.softmax(softmax_temp * QK, dim=-1)
        mask = QK.new_ones(QK.shape)
        mask[
            torch.arange(N, device=QK.device).view(N, 1, 1, 1),
            torch.arange(H, device=QK.device).view(1, H, 1, 1),
            torch.arange(C, device=QK.device).view(1, 1, C, 1),
            topk,
        ] = 0
        A = A * mask
        A_bottomk = A.sum(-1)
        A = self.dropout(A)
        # Compute the values
        V_new = torch.einsum("nhls,nhse->nhle", A, V)
        # Broadcast the values back depending on the groups
        V_new = self._broadcast_values(V_new, clusters, counts, query_lengths._lengths.int())

        return V_new, A_bottomk

    def forward(self, queries, keys, values, attn_mask, query_lengths,
                key_lengths):
        # Make sure that there is no attention mask
        assert attn_mask.all_ones, ("Improved-clustered attention cannot "
                                    "use an arbitrary attention mask.")

        queries = queries.permute(0,2,1,3).contiguous()
        keys = keys.permute(0,2,1,3).contiguous()
        values = values.permute(0,2,1,3).contiguous()
        N, H, L, E = queries.shape
        _, _, S, D = values.shape
        softmax_temp = self.softmax_temp or 1./sqrt(E)

        # Cluster the queries into groups
        groups, sorted_indx = self._create_query_groups(queries, query_lengths)
        clusters, counts = groups

        # Re-organize queries so that first group belong to first cluster
        # next to second cluster and so on. This improves kernel implementations.
        # Note that this step is introduced after NeurIPS submission and
        # now the complexity is O(N log(N)).
        q_offset = torch.arange(N*H, device=queries.device).unsqueeze(-1) * L
        q_flat = (sorted_indx.view(N*H, -1) + q_offset).reshape(-1)
        s_queries = queries.reshape(-1, E).index_select(0, q_flat).view(N,H,L,E)

        # Aggregate the re-arranged queries.
        Q_grouped = _GroupQueries.apply(s_queries, *groups, query_lengths.lengths.int())
        # Compute the attention
        QK = torch.einsum("nhle,nhse->nhls", Q_grouped, keys)
        QK = QK + key_lengths.additive_matrix[:, None, None, :]
        topk_values, topk = torch.topk(QK, min(self.topk, S), sorted=False, dim=-1)
        assert topk.is_contiguous()

        # Now compute the attention with only the bottom keys
        V_bottomk, A_bottomk = self._bottomk_attention(
            QK, values,
            clusters, counts,
            query_lengths,
            topk,
            softmax_temp
        )

        # Now compute the attention with only the top keys
        V_topk = self._topk_attention(
            s_queries, keys, values,
            clusters, counts,
            topk, topk_values,
            A_bottomk,
            softmax_temp,
            query_lengths
        )
        V_sorted_new = V_topk + V_bottomk

        # Reverse the previous mapping
        sorted_rev_indx = torch.argsort(sorted_indx, dim=-1)
        q_rev_flat = (sorted_rev_indx.view(N*H, -1) + q_offset).reshape(-1)
        V_new = V_sorted_new.reshape(-1, D).index_select(0, q_rev_flat).view(N,H,L,D)
        return V_new.permute(0, 2, 1, 3).contiguous()


# Register the attention implementation so that it becomes available in our
# builders
AttentionRegistry.register(
    "improved-clustered", ImprovedClusteredAttention,
    [
        ("clusters", Int),
        ("iterations", Optional(Int, 10)),
        ("bits", Optional(Int, 63)),
        ("hash_bias", Optional(Bool, True)),
        ("topk", Optional(Int, 32)),
        ("softmax_temp", Optional(Float)),
        ("attention_dropout", Optional(Float, 0.1)),
        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
    ]
)
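A toy illustration (not part of the diff) of how the two pieces computed above recombine: the exact softmax over the top-k keys is rescaled by the probability mass that the clustered attention assigns to the remaining bottom keys, which is exactly the `A = A * (1.0 - A_bottomk)` step. In the module the bottom-k mass comes from the grouped (centroid) scores; the sketch below uses one set of scores for both, i.e. the case where a query coincides with its centroid, so the recombination recovers the full softmax exactly.

    import torch

    S, k = 8, 3
    scores = torch.randn(S)
    topk_idx = scores.topk(k).indices

    A_full = torch.softmax(scores, dim=-1)
    A_bottomk = A_full.clone()
    A_bottomk[topk_idx] = 0
    bottom_mass = A_bottomk.sum()                     # mass outside the top-k

    A_topk = torch.softmax(scores[topk_idx], dim=-1)  # exact softmax on the top-k only
    recombined = A_topk * (1 - bottom_mass)
    assert torch.allclose(recombined, A_full[topk_idx], atol=1e-6)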
smi-ted/inference/smi_ted_light/fast_transformers/attention/improved_clustered_causal_attention.py
ADDED
@@ -0,0 +1,257 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#

"""Implement improved clustered causal self attention."""

from math import sqrt

import torch
import torch.autograd
from torch.nn import Dropout, Module
from torch.nn.init import normal_

from ..attention_registry import AttentionRegistry, Optional, Float, Int, \
    Bool, EventDispatcherInstance
from ..events import EventDispatcher
from ..masking import FullMask
from ..aggregate import clustered_aggregate, clustered_broadcast
from ..clustering.hamming import cluster
from ..hashing import compute_hashes
from ..sparse_product import sparse_dot_product, sparse_weighted_average
from ..sparse_product import clustered_sparse_dot_product, \
    clustered_sparse_weighted_average


class _GroupQueries(torch.autograd.Function):
    @staticmethod
    def forward(ctx, Q, clusters, counts, lengths):
        factors = 1./counts.float()
        q_grouped = clustered_aggregate(Q, clusters, factors, lengths)
        ctx.save_for_backward(clusters, counts, factors)

        return q_grouped

    @staticmethod
    def backward(ctx, grad_q_grouped):
        clusters, counts, factors = ctx.saved_tensors
        grad_q = clustered_broadcast(grad_q_grouped, clusters, counts, factors)

        return grad_q, None, None, None


class _BroadcastValues(torch.autograd.Function):
    @staticmethod
    def forward(ctx, v_grouped, clusters, counts, lengths):
        factors = torch.ones_like(counts, dtype=v_grouped.dtype)
        V = clustered_broadcast(v_grouped, clusters, counts, factors)
        ctx.save_for_backward(clusters, counts, factors, lengths)

        return V

    @staticmethod
    def backward(ctx, grad_v):
        clusters, counts, factors, lengths = ctx.saved_tensors
        grad_v_grouped = clustered_aggregate(grad_v, clusters, factors, lengths)

        return grad_v_grouped, None, None, None, None


class ImprovedClusteredCausalAttention(Module):
    """
    Improved clustered causal attention approximation by recomputing attention
    for each query with the top-k keys for the corresponding cluster.

    Given the queries, keys, and values as Q, K, and V respectively, we
    first cluster the queries in "C" groups and compute the "C" query centroids
    Q_c.

    We now use the centroids Q_c to identify the top-k keys with highest
    dot products.

    Subsequently, for each query we compute the sparse dot product with
    the corresponding top-k keys to improve the attention approximation.

    Key difference with improved clustered attention is that we only use
    top-k keys with causal mask, we do not compute attention on the
    bottom-k keys.

    Arguments
    ---------
        clusters: How many clusters to group the queries into
        iterations: The number of lloyd iterations to perform (default: 10)
        bits: How many bits to use for the hash (default: 32)
        hash_bias: If true, hamming distance proportional to L2 distance
                   If false, hamming distance proportional to cosine distance
                   (default: True)
        topk: Number of top-k keys for the improved approximation (default: 32)
        softmax_temp: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.1)
        event_dispatcher: str or EventDispatcher instance to be used by this
                          module for dispatching events (default: the default
                          global dispatcher)
    """
    def __init__(self, clusters, iterations=10, bits=32,
                 hash_bias=True, topk=32, softmax_temp=None,
                 attention_dropout=0.1, event_dispatcher=""):
        super(ImprovedClusteredCausalAttention, self).__init__()
        self.clusters = clusters
        self.iterations = iterations
        self.bits = bits
        self.hash_bias = hash_bias
        self.topk = topk
        self.softmax_temp = softmax_temp
        self.dropout = Dropout(attention_dropout)
        self.event_dispatcher = EventDispatcher.get(event_dispatcher)

    def _create_query_groups(self, Q, query_lengths):
        N, H, L, E = Q.shape

        # Compute the hashes for all the queries
        planes = Q.new_empty((self.bits, E+1))
        normal_(planes)
        if not self.hash_bias:
            planes[:, -1] = 0
        hashes = compute_hashes(Q.view(N*H*L, E), planes).view(N, H, L)

        # Cluster the hashes and return the cluster index per query
        clusters, counts = cluster(
            hashes,
            query_lengths.lengths.int(),
            clusters=self.clusters,
            iterations=self.iterations,
            bits=self.bits
        )
        sorted_clusters, sorted_indx = torch.sort(clusters, dim=-1)
        return (sorted_clusters, counts), sorted_indx

    def _topk_attention(self, Q, K, V,
                        q_flat, q_rev_flat,
                        clusters, counts,
                        topk, topk_values,
                        softmax_temp,
                        query_lengths):
        """Return the attention with just the topk heads."""
        # Extract some indices
        N, H, L, E = Q.shape
        _, _, S, _ = K.shape
        _, _, C, k = topk.shape

        # We need to pass the output tensor to initialize to 0
        QK = clustered_sparse_dot_product(
            Q, K, topk,
            clusters, counts,
            query_lengths.lengths.int()
        )
        # We need to mask out the future
        assert topk.is_contiguous()
        topk_broadcast = clustered_broadcast(
            topk.float(),
            clusters,
            counts,
            torch.ones_like(counts, dtype=torch.float32)
        )
        # Need to be careful here we changed the order of the keys the
        # masking on future needs to be applied in the same way
        seq_ids = torch.arange(L, device=QK.device).view(1, 1, L, 1).repeat(N, H, 1, 1)
        # permute the ids in the same way as input so as to mask the right
        # entries for each query
        s_seq_ids = seq_ids.reshape(-1, 1).index_select(0, q_flat).view(N,H,L,1)
        future_mask = topk_broadcast.long() > s_seq_ids
        QK = QK.masked_fill(
            future_mask,
            float("-1e7")
        )
        A = torch.softmax(softmax_temp * QK, dim=-1)
        # Mask again to ensure no probabilities leak due to float(-1e7)
        # Leakage could be very high as we use a small top-k
        A = A * (1. - future_mask.float())
        A = self.dropout(A)
        assert A.is_contiguous()
        V_new = clustered_sparse_weighted_average(A, V, topk, clusters, counts)

        return V_new

    def _broadcast_values(self, V, clusters, counts, lengths):
        """Broadcast the values back to the correct positions but make sure
        that the gradient flows properly."""
        V_new = _BroadcastValues.apply(V.contiguous(), clusters, counts, lengths)
        return V_new

    def forward(self, queries, keys, values, attn_mask, query_lengths,
                key_lengths):

        # Apply the key padding mask and make sure the attn_mask is a
        # lower triangular causal mask
        if not attn_mask.lower_triangular:
            raise RuntimeError(("ImprovedClusteredCausalAttention only supports "
                                "lower triangular masks"))
        queries = queries.permute(0,2,1,3).contiguous()
        keys = keys.permute(0,2,1,3).contiguous()
        values = values.permute(0,2,1,3).contiguous()
        N, H, L, E = queries.shape
        _, _, S, D = values.shape
        softmax_temp = self.softmax_temp or 1./sqrt(E)

        # Cluster the queries into groups
        groups, sorted_indx = self._create_query_groups(queries, query_lengths)
        clusters, counts = groups

        # Re-organize queries so that first group belong to first cluster
        # next to second cluster and so on. This improves kernel implementations.
        # Note that this step is introduced after NeurIPS submission and
        # now the complexity is O(N log(N)).
        q_offset = torch.arange(N*H, device=queries.device).unsqueeze(-1) * L
        q_flat = (sorted_indx.view(N*H, -1) + q_offset).reshape(-1)
        s_queries = queries.reshape(-1, E).index_select(0, q_flat).view(N,H,L,E)

        # Aggregate the re-arranged queries.
        Q_grouped = _GroupQueries.apply(s_queries, *groups, query_lengths.lengths.int())
        # Compute the attention
        QK = torch.einsum("nhle,nhse->nhls", Q_grouped, keys)
        QK = QK + key_lengths.additive_matrix[:, None, None, :]
        # Set topk to minimum of key lengths if it is smaller than self.topk
        cur_topk = min(self.topk, min(key_lengths.lengths).item())
        topk_values, topk = torch.topk(QK, cur_topk, sorted=False, dim=-1)
        assert topk.is_contiguous()

        # Reverse mapping
        sorted_rev_indx = torch.argsort(sorted_indx, dim=-1)
        q_rev_flat = (sorted_rev_indx.view(N*H, -1) + q_offset).reshape(-1)

        # Compute the attention with only the top keys
        V_topk = self._topk_attention(
            s_queries, keys, values,
            q_flat, q_rev_flat,
            clusters, counts,
            topk, topk_values,
            softmax_temp,
            query_lengths
        )
        V_sorted_new = V_topk

        # Reverse the mapping to get correct values
        V_new = V_sorted_new.reshape(-1, D).index_select(0, q_rev_flat).view(N,H,L,D)
        return V_new.permute(0, 2, 1, 3).contiguous()


# Register the attention implementation so that it becomes available in our
# builders
AttentionRegistry.register(
    "causal-improved-clustered", ImprovedClusteredCausalAttention,
    [
        ("clusters", Int),
        ("iterations", Optional(Int, 10)),
        ("bits", Optional(Int, 63)),
        ("hash_bias", Optional(Bool, True)),
        ("topk", Optional(Int, 32)),
        ("softmax_temp", Optional(Float)),
        ("attention_dropout", Optional(Float, 0.1)),
        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
    ]
)
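A small, self-contained illustration (not from the diff) of the future-masking step used in `_topk_attention` above: after the queries are re-ordered by cluster, a query at original position i may only attend to top-k key indices j with j <= i, so any broadcast top-k index greater than the query's original position is masked out. The tensors below are made up for illustration.

    import torch

    L, k = 6, 3
    # hypothetical top-k key indices broadcast back to each of the L queries
    topk_per_query = torch.tensor([[0, 2, 5]] * L)        # (L, k)
    positions = torch.arange(L).view(L, 1)                # original position i of each query
    future_mask = topk_per_query > positions              # True where key index j > i
    scores = torch.randn(L, k).masked_fill(future_mask, float("-1e7"))
    A = torch.softmax(scores, dim=-1) * (~future_mask).float()  # second mask, as in the module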
smi-ted/inference/smi_ted_light/fast_transformers/attention/linear_attention.py
ADDED
@@ -0,0 +1,92 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#

"""Implement unmasked linear attention."""

import torch
from torch.nn import Module

from ..attention_registry import AttentionRegistry, Optional, Callable, Int, \
    EventDispatcherInstance
from ..events import EventDispatcher
from ..feature_maps import elu_feature_map


class LinearAttention(Module):
    """Implement unmasked attention using dot product of feature maps in
    O(N D^2) complexity.

    Given the queries, keys and values as Q, K, V instead of computing

        V' = softmax(Q.mm(K.t()), dim=-1).mm(V),

    we make use of a feature map function Φ(.) and perform the following
    computation

        V' = normalize(Φ(Q).mm(Φ(K).t())).mm(V).

    The above can be computed in O(N D^2) complexity where D is the
    dimensionality of Q, K and V and N is the sequence length. Depending on the
    feature map, however, the complexity of the attention might be limited.

    Arguments
    ---------
        feature_map: callable, a callable that applies the feature map to the
                     last dimension of a tensor (default: elu(x)+1)
        eps: float, a small number to ensure the numerical stability of the
             denominator (default: 1e-6)
        event_dispatcher: str or EventDispatcher instance to be used by this
                          module for dispatching events (default: the default
                          global dispatcher)
    """
    def __init__(self, query_dimensions, feature_map=None, eps=1e-6,
                 event_dispatcher=""):
        super(LinearAttention, self).__init__()
        self.feature_map = (
            feature_map(query_dimensions) if feature_map else
            elu_feature_map(query_dimensions)
        )
        self.eps = eps
        self.event_dispatcher = EventDispatcher.get(event_dispatcher)

    def forward(self, queries, keys, values, attn_mask, query_lengths,
                key_lengths):
        # Apply the feature map to the queries and keys
        self.feature_map.new_feature_map(queries.device)
        Q = self.feature_map.forward_queries(queries)
        K = self.feature_map.forward_keys(keys)

        # Apply the key padding mask and make sure that the attn_mask is
        # all_ones
        if not attn_mask.all_ones:
            raise RuntimeError(("LinearAttention does not support arbitrary "
                                "attention masks"))
        K = K * key_lengths.float_matrix[:, :, None, None]

        # Compute the KV matrix, namely the dot product of keys and values so
        # that we never explicitly compute the attention matrix and thus
        # decrease the complexity
        KV = torch.einsum("nshd,nshm->nhmd", K, values)

        # Compute the normalizer
        Z = 1/(torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1))+self.eps)

        # Finally compute and return the new values
        V = torch.einsum("nlhd,nhmd,nlh->nlhm", Q, KV, Z)

        return V.contiguous()


# Register the attention implementation so that it becomes available in our
# builders
AttentionRegistry.register(
    "linear", LinearAttention,
    [
        ("query_dimensions", Int),
        ("feature_map", Optional(Callable)),
        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
    ]
)
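A toy numerical check (not from the diff) of why the einsums above give the same result as normalized quadratic attention: with a positive feature map Φ, computing Φ(Q)(Φ(K)ᵀV) and then dividing by Φ(Q)(Φ(K)ᵀ1) matches the row-normalized Φ(Q)Φ(K)ᵀ applied to V. The small dimensions and the `phi` lambda are made up for illustration; the default feature map in the module is elu(x)+1.

    import torch

    N, L, S, H, D, M = 2, 5, 7, 1, 4, 3
    phi = lambda x: torch.nn.functional.elu(x) + 1          # default feature map
    Q, K = phi(torch.randn(N, L, H, D)), phi(torch.randn(N, S, H, D))
    V = torch.randn(N, S, H, M)

    # linear-complexity path, mirroring the forward() above
    KV = torch.einsum("nshd,nshm->nhmd", K, V)
    Z = 1 / (torch.einsum("nlhd,nhd->nlh", Q, K.sum(dim=1)) + 1e-6)
    out_linear = torch.einsum("nlhd,nhmd,nlh->nlhm", Q, KV, Z)

    # quadratic reference: explicit (normalized) attention matrix
    A = torch.einsum("nlhd,nshd->nhls", Q, K)
    out_quadratic = torch.einsum("nhls,nshm->nlhm", A / A.sum(-1, keepdim=True), V)

    assert torch.allclose(out_linear, out_quadratic, atol=1e-5)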
smi-ted/inference/smi_ted_light/fast_transformers/attention/local_attention.py
ADDED
@@ -0,0 +1,101 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

"""Implement local context attention."""

from math import sqrt

import torch
from torch.nn import Module, Dropout
from torch.nn import functional as F

from ..attention_registry import AttentionRegistry, Optional, Int, Float, \
    EventDispatcherInstance
from ..events import EventDispatcher
from ..local_product import local_dot_product, local_weighted_average


class LocalAttention(Module):
    """Implement fast local attention where a query can only attend to
    neighboring keys.

    In this attention module the query Q_i can only attend to a key K_j if
    |i-j| < local_context/2.

    Arguments
    ---------
        local_context: The neighborhood to consider for local attention.
        softmax_temp: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.1)
        event_dispatcher: str or EventDispatcher instance to be used by this
                          module for dispatching events (default: the default
                          global dispatcher)
    """
    def __init__(self, local_context, softmax_temp=None, attention_dropout=0.1,
                 event_dispatcher=""):
        super(LocalAttention, self).__init__()
        self.local_context = local_context
        self.softmax_temp = softmax_temp
        self.dropout = Dropout(attention_dropout)
        self.event_dispatcher = EventDispatcher.get(event_dispatcher)

    def forward(self, queries, keys, values, attn_mask, query_lengths,
                key_lengths):
        """Implements the local attention.

        The attn_mask can be anything but the only values that will be
        considered will be the ones in the neighborhood of each query.

        Arguments
        ---------
            queries: (N, L, H, E) The tensor containing the queries
            keys: (N, S, H, E) The tensor containing the keys
            values: (N, S, H, D) The tensor containing the values
            attn_mask: An implementation of BaseMask that encodes where each
                       query can attend to
            query_lengths: An implementation of BaseMask that encodes how
                           many queries each sequence in the batch consists of
            key_lengths: An implementation of BaseMask that encodes how
                         many keys each sequence in the batch consists of
        """
        # Extract some shapes and compute the temperature
        N, L, H, E = queries.shape
        _, S, _, D = values.shape
        context = self.local_context
        softmax_temp = self.softmax_temp or 1./sqrt(E)

        # Permute the dimensions to NHLE instead of NLHE
        queries = queries.permute(0, 2, 1, 3).contiguous()
        keys = keys.permute(0, 2, 1, 3).contiguous()
        values = values.permute(0, 2, 1, 3).contiguous()

        QK = local_dot_product(
            queries,
            keys,
            attn_mask.additive_matrix_finite,
            key_lengths.lengths,
            self.local_context
        )
        A = self.dropout(torch.softmax(softmax_temp * QK, dim=-1))

        V_new = local_weighted_average(A, values)

        return V_new.permute(0, 2, 1, 3).contiguous()


# Register the attention implementation so that it becomes available in our
# builders
AttentionRegistry.register(
    "local", LocalAttention,
    [
        ("local_context", Int),
        ("softmax_temp", Optional(Float)),
        ("attention_dropout", Optional(Float, 0.1)),
        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
    ]
)
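A tiny illustration (not from the diff) of the neighborhood rule in the docstring above, |i-j| < local_context/2, realized as a dense band mask on made-up sizes; the optimized `local_dot_product` kernel computes only the entries inside this band.

    import torch

    L, local_context = 6, 3
    i = torch.arange(L).view(L, 1)
    j = torch.arange(L).view(1, L)
    band = (i - j).abs() < local_context / 2   # True inside each query's local window
    print(band.int())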
smi-ted/inference/smi_ted_light/fast_transformers/attention/reformer_attention.py
ADDED
@@ -0,0 +1,166 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#

"""Implement the Reformer attention from the paper
"Reformer: The Efficient Transformer"."""

from math import sqrt

import torch
from torch.nn import Dropout, Module
from torch.nn.init import normal_

from ..attention_registry import AttentionRegistry, Optional, Int, Float, \
    Bool, EventDispatcherInstance
from ..events import EventDispatcher
from ..masking import FullMask


class ReformerAttention(Module):
    """Implement the attention module of the paper "Reformer: The Efficient
    Transformer".

    Arguments
    ---------
        chunk_size : Chunk size for each block (default: 32)
        bits : Number of bits for hashing (default: 8)
        rounds : Number of rounds of attention computation (default: 4)
        masked : If true, the query does not attend to itself (default: False)
        softmax_temp: The temperature to use for the softmax attention.
                      (default: 1/sqrt(d_keys) where d_keys is computed at
                      runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.1)
        event_dispatcher: str or EventDispatcher instance to be used by this
                          module for dispatching events (default: the default
                          global dispatcher)
    """

    def __init__(self, chunk_size=32, bits=8, rounds=4, masked=False,
                 softmax_temp=None, attention_dropout=0.1,
                 event_dispatcher=""):
        super(ReformerAttention, self).__init__()

        self.chunk_size = chunk_size
        self.bits = bits
        self.rounds = rounds
        self.masked = masked
        self.softmax_temp = softmax_temp
        self.dropout = Dropout(attention_dropout)
        self.event_dispatcher = EventDispatcher.get(event_dispatcher)

    def _normalize(self, x):
        norms = torch.sqrt(torch.einsum("nlhe,nlhe->nlh", x, x))
        x_normed = x / norms.unsqueeze(-1)
        return x_normed

    def _look_back(self, x):
        xshape = x.shape

        return torch.cat([
            x.new_zeros((xshape[0], 1) + xshape[2:]),
            torch.repeat_interleave(x, 2, dim=1)[:,:-1]
        ], dim=1).view(xshape[0], xshape[1], 2*xshape[2], *xshape[3:])

    def _reformer_round(self, Q, K, V, mask, softmax_temp):
        # Hash the queries
        N, L, H, E = Q.shape
        planes = Q.new_empty(self.bits, E)
        normal_(planes)
        projected = torch.einsum("nlhe,be->nlhb", K, planes)
        hashes = torch.argmax(
            torch.cat([projected, -projected], dim=-1),
            dim=-1
        )

        # Sort the queries in order to group them
        group = torch.argsort(hashes, dim=1)

        invert_group = torch.empty_like(group)
        batch_indices = torch.arange(N, device=hashes.device).view(N, 1, 1)
        sequence_indices = torch.arange(L, device=hashes.device).view(1, L, 1)
        head_indices = torch.arange(H, device=hashes.device).view(1, 1, H)
        invert_group[batch_indices, group, head_indices] = sequence_indices
        group = group.view(N, -1, self.chunk_size, H)
        invert_group = invert_group.view(N, -1, self.chunk_size, H)
        batch_indices = batch_indices.unsqueeze(1)
        head_indices = head_indices.unsqueeze(0)

        # Reorder Q, V and mask
        Q_grouped = Q[batch_indices, group, head_indices]
        K_grouped = K[batch_indices, group, head_indices]
        V_grouped = V[batch_indices, group, head_indices]
        mask_grouped = mask[
            batch_indices.unsqueeze(1),
            group.unsqueeze(3),
            self._look_back(group).unsqueeze(2)
        ]

        mask_grouped[:, 0, :, :Q_grouped.shape[2]] = float("-inf")

        # When everything is masked just unmask everything because it doesn't
        # matter what the output is at those positions
        # This is to avoid inf/nans in the new values at masked positions
        infmask = torch.isinf(mask_grouped)
        infmask = torch.all(infmask, dim=3, keepdims=True)
        mask_grouped = mask_grouped.masked_fill(infmask, 0.)

        # Attention
        K_grouped = self._look_back(K_grouped)
        QQ = torch.einsum("nblhe,nbshe->nbhls", Q_grouped, K_grouped)
        QQ = QQ + mask_grouped.permute(0, 1, 4, 2, 3)
        A = torch.softmax(softmax_temp * QQ, dim=-1)
        A = self.dropout(A)

        # Values
        V_grouped = self._look_back(V_grouped)
        V_new = torch.einsum("nbhls,nbshe->nblhe", A, V_grouped)
        V_new = V_new.contiguous().view(N, -1, H, E)
        V_new = V_new[batch_indices, invert_group, head_indices]
        V_new = V_new.contiguous().view(N, L, H, E)
        return V_new

    def forward(self, queries, keys, values, attn_mask, query_lengths,
                key_lengths):
        # Extract the dimensions of query, key, value
        N, L, H, E = queries.shape

        softmax_temp = self.softmax_temp or 1./sqrt(E)
        # Create the mask
        mask = key_lengths.additive_matrix.unsqueeze(1).expand(N, L, L)
        if self.masked:
            mask = mask + torch.eye(L, device=queries.device).unsqueeze(0)*float(-1e9)

        if not attn_mask.all_ones:
            mask = mask + attn_mask.additive_matrix.unsqueeze(0)
        # Get normalized Queries as Keys
        K = self._normalize(queries)
        # Zero the masked out keys
        K = K * key_lengths.float_matrix.view(N, L, 1, 1)

        V_new = 0
        factor = 1/self.rounds
        for i in range(self.rounds):
            V_new = V_new + \
                factor * self._reformer_round(queries, K, values, mask, softmax_temp)

        return V_new


# Register the attention implementation so that it becomes available in our
# builders
AttentionRegistry.register(
    "reformer", ReformerAttention,
    [
        ("chunk_size", Optional(Int, 32)),
        ("bits", Optional(Int, 63)),
        ("rounds", Optional(Int, 4)),
        ("masked", Optional(Bool, False)),
        ("softmax_temp", Optional(Float)),
        ("attention_dropout", Optional(Float, 0.1)),
        ("event_dispatcher", Optional(EventDispatcherInstance, ""))
    ]
)
smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__init__.py
ADDED
@@ -0,0 +1,17 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

"""Allow for the dynamic registration of new attention implementations.

This module provides a Registry implementation that other modules can use to
register attention implementations for the builders.
"""

from .registry import \
    AttentionRegistry, \
    RecurrentAttentionRegistry, \
    RecurrentCrossAttentionRegistry
from .spec import Spec, Choice, Optional, Int, Float, Bool, Callable, \
    EventDispatcherInstance
smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (786 Bytes).
smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__pycache__/registry.cpython-310.pyc
ADDED
Binary file (2.27 kB).
smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/__pycache__/spec.cpython-310.pyc
ADDED
Binary file (4.73 kB).
smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/registry.py
ADDED
@@ -0,0 +1,61 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#


class Registry(object):
    """Hold the available attention implementations and their required
    parameters."""
    def __init__(self):
        self._classes = {}
        self._class_params = {}
        self._parameters = {}

    def register(self, key, class_object, parameter_tuples):
        # register the class if the key is new
        if key in self._classes:
            raise ValueError("{} is already registered".format(key))
        self._classes[key] = class_object

        # register the parameters
        for parameter, spec in parameter_tuples:
            if (
                parameter in self._parameters and
                self._parameters[parameter] != spec
            ):
                raise ValueError(("{} is already registered with "
                                  "spec {!r} instead of {!r}").format(
                                      parameter,
                                      self._parameters[parameter],
                                      spec
                                  ))
            self._parameters[parameter] = spec

        # note which parameters are needed by this class
        self._class_params[key] = [p for p, s in parameter_tuples]

    def __contains__(self, key):
        return key in self._classes

    def __getitem__(self, key):
        return self._classes[key], self._class_params[key]

    @property
    def keys(self):
        return list(self._classes.keys())

    def contains_parameter(self, key):
        return key in self._parameters

    def validate_parameter(self, key, value):
        try:
            return self._parameters[key].get(value)
        except Exception as e:
            raise ValueError(("Invalid value {!r} for "
                              "parameter {!r}").format(value, key)) from e


AttentionRegistry = Registry()
RecurrentAttentionRegistry = Registry()
RecurrentCrossAttentionRegistry = Registry()
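A hypothetical sketch (not part of the diff) of how a registry instance is used; `MyAttention` and the "my-attention" key are made up for illustration, and `Registry`, `Optional` and `Float` are assumed to be imported from the registry and spec modules in this directory.

    from torch.nn import Module

    class MyAttention(Module):
        def __init__(self, softmax_temp=None):
            super(MyAttention, self).__init__()
            self.softmax_temp = softmax_temp

    registry = Registry()
    registry.register(
        "my-attention", MyAttention,
        [("softmax_temp", Optional(Float))]
    )
    cls, params = registry["my-attention"]             # (MyAttention, ["softmax_temp"])
    registry.validate_parameter("softmax_temp", 0.5)   # -> 0.5 (coerced through the spec)
    "my-attention" in registry                         # -> True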
smi-ted/inference/smi_ted_light/fast_transformers/attention_registry/spec.py
ADDED
@@ -0,0 +1,126 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

"""Spec instances allow describing and checking the type and value of
parameters."""

from ..events import EventDispatcher


class Spec(object):
    """Describe and validate a parameter type.

    Arguments
    ---------
        predicate: A callable that checks if the value is acceptable and
                   returns its canonical value or raises ValueError.
        name: A name to create a human readable description of the Spec
    """
    def __init__(self, predicate, name="CustomSpec"):
        self._predicate = predicate
        self._name = name

    def __repr__(self):
        return self._name

    def check(self, x):
        try:
            self._predicate(x)
            return True
        except ValueError:
            return False

    def get(self, x):
        return self._predicate(x)

    def __eq__(self, y):
        return self is y


class Choice(Spec):
    """A parameter type for a set of options.

    Arguments
    ---------
        choices: A set or list of possible values for this parameter
    """
    def __init__(self, choices):
        self._choices = choices

    def get(self, x):
        if x in self._choices:
            return x
        raise ValueError("{!r} is not in {!r}".format(x, self._choices))

    def __repr__(self):
        return "Choice({!r})".format(self._choices)

    def __eq__(self, x):
        if isinstance(x, Choice):
            return self._choices == x._choices
        return False


class _Callable(Spec):
    def __init__(self):
        super(_Callable, self).__init__(None, "Callable")

    def get(self, x):
        if callable(x):
            return x
        raise ValueError("{!r} is not a callable".format(x))


class _EventDispatcherInstance(Spec):
    def __init__(self):
        super(_EventDispatcherInstance, self).__init__(
            _EventDispatcherInstance._get_event_dispatcher,
            "EventDispatcherInstance"
        )

    @staticmethod
    def _get_event_dispatcher(x):
        if isinstance(x, str):
            return x
        if isinstance(x, EventDispatcher):
            return x
        raise ValueError("{!r} is not an event dispatcher".format(x))


class Optional(Spec):
    """Represent an optional parameter that can either have a value or it can
    be None.

    Arguments
    ---------
        spec: The spec for the value if it is not None
        default: The returned value in case it is None
    """
    def __init__(self, spec, default=None):
        self._other_spec = spec
        self._default = default

    def __repr__(self):
        return "Optional[{!r}, {!r}]".format(self._other_spec, self._default)

    def get(self, x):
        if x is None:
            return self._default
        return self._other_spec.get(x)

    def __eq__(self, x):
        if isinstance(x, Optional):
            return (
                self._other_spec == x._other_spec and
                self._default == x._default
            )
        return False


Int = Spec(int, "Int")
Float = Spec(float, "Float")
Bool = Spec(bool, "Bool")
Callable = _Callable()
EventDispatcherInstance = _EventDispatcherInstance()
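A short illustration (not part of the diff) of how these Spec types behave, assuming the names below are evaluated in the context of the module above or imported from it.

    Int.get(10)                               # -> 10
    Int.check("ten")                          # -> False, because int("ten") raises ValueError
    Optional(Int, 10).get(None)               # -> 10, the default is returned for None
    Optional(Int, 10).get("5")                # -> 5, coerced through the inner Int spec
    Choice(["linear", "full"]).get("linear")  # -> "linear"; anything else raises ValueError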
smi-ted/inference/smi_ted_light/fast_transformers/builders/__init__.py
ADDED
@@ -0,0 +1,59 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#

"""This module implements builders that simplify building complex transformer
architectures with different attention mechanisms.

The main idea is to facilitate the construction of various attention layers and
transformer encoder layers and simplify their assembly into one transformer
module. It also allows for flexibility in the scripts as many builder
parameters can correspond 1-1 with command line arguments.

Example usage:

    builder = TransformerEncoderBuilder()
    builder.n_layers = 12
    builder.n_heads = 8
    builder.feed_forward_dimensions = 1024
    builder.query_dimensions = 64
    builder.value_dimensions = 64
    builder.dropout = 0.1
    builder.attention_dropout = 0.1
    builder.attention_type = "linear"
    transformer = builder.get()
"""

__all__ = [
    "AttentionBuilder",
    "RecurrentAttentionBuilder",
    "RecurrentCrossAttentionBuilder"
]

# Import the attention implementations so that they register themselves with
# the builder. Attention implementations external to the library should be
# imported before using the builders.
#
# TODO: Should this behaviour change? Namely, should all attention
#       implementations be imported in order to be useable? This also allows
#       using the library even partially built, for instance.
from ..attention import \
    FullAttention, \
    LinearAttention

del FullAttention, \
    LinearAttention


from .attention_builders import \
    AttentionBuilder, \
    RecurrentAttentionBuilder, \
    RecurrentCrossAttentionBuilder

from .transformer_builders import \
    TransformerEncoderBuilder, \
    RecurrentEncoderBuilder, \
    TransformerDecoderBuilder, \
    RecurrentDecoderBuilder
smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (1.46 kB).
smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/attention_builders.cpython-310.pyc
ADDED
Binary file (6.49 kB).
smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/base.cpython-310.pyc
ADDED
Binary file (2.3 kB).
smi-ted/inference/smi_ted_light/fast_transformers/builders/__pycache__/transformer_builders.cpython-310.pyc
ADDED
Binary file (18.7 kB).
smi-ted/inference/smi_ted_light/fast_transformers/builders/attention_builders.py
ADDED
@@ -0,0 +1,139 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

from collections import defaultdict

from .base import BaseBuilder
from ..attention_registry import \
    AttentionRegistry, \
    RecurrentAttentionRegistry, \
    RecurrentCrossAttentionRegistry


class BaseAttentionBuilder(BaseBuilder):
    def __init__(self, registry):
        self._registry = registry
        self._parameters = defaultdict(lambda: None)

    @property
    def available_attentions(self):
        """Return a list with the available attention implementations."""
        return self._registry.keys

    def validate_attention_type(self, attention_type):
        """Parse the attention type according to the rules used by `get()` and
        check if the requested attention is constructible."""
        return all(
            all(t in self._registry for t in a.split(","))
            for a in attention_type.split(":")
        )

    def __setattr__(self, key, value):
        # Make sure we have normal behaviour for the class members _registry
        # and _parameters
        if key in ["_registry", "_parameters"]:
            return object.__setattr__(self, key, value)

        # Assign everything else in the parameters dictionary
        if not self._registry.contains_parameter(key):
            raise AttributeError(("{!r} is not a valid attention "
                                  "parameter name").format(key))
        self._parameters[key] = self._registry.validate_parameter(key, value)

    def __getattr__(self, key):
        if key in self._parameters:
            return self._parameters[key]
        else:
            raise AttributeError()

    def __repr__(self):
        return (
            "{}.from_kwargs(\n".format(self.__class__.__name__) +
            "\n".join(["    {}={!r},".format(k, v)
                       for k, v in self._parameters.items()])[:-1] +
            "\n)"
        )

    def get(self, attention_type):
        """Construct the attention implementation object and return it.

        The passed in attention_type argument defines the attention to be
        created. It should be a string and in its simplest form it should
        be one of the available choices from `available_attentions`.

        However, to enable attention decoration, namely an attention
        implementation augmenting the functionality of another implementation,
        the attention type can be a colon separated list of compositions like
        the following examples:

            - 'att1' means instantiate att1
            - 'att2:att1' means instantiate att1 and decorate it with att2
            - 'att3:att1,att4' means instantiate att1 and att4 and decorate
              them with att3

        Arguments
        ---------
            attention_type: A string that contains one or more keys from
                            `available_attentions` separated with a colon to
                            denote the decoration pattern.
        """
        compositions = reversed(attention_type.split(":"))
        attentions = []
        for c in compositions:
            attentions = [
                self._construct_attention(t, attentions)
                for t in c.split(",")
            ]
        if len(attentions) > 1:
            raise ValueError(("Invalid attention_type argument "
                              "{!r}").format(attention_type))
        return attentions[0]

    def _construct_attention(self, attention_type, decorated=[]):
        """Construct an attention implementation object.

        Arguments
        ---------
            attention_type: A string that contains a single key from the
                            `available_attentions`
            decorated: A list of attention implementations to pass as arguments
                       to be decorated
        """
        if attention_type not in self._registry:
            raise ValueError(("Unknown attention type "
                              "{!r}").format(attention_type))

        attention, parameters = self._registry[attention_type]
        parameter_dictionary = {
            p: self._registry.validate_parameter(p, self._parameters[p])
            for p in parameters
        }

        return attention(*decorated, **parameter_dictionary)


class AttentionBuilder(BaseAttentionBuilder):
    """Build attention implementations for batch sequence processing or
    training."""
    def __init__(self):
        super(AttentionBuilder, self).__init__(AttentionRegistry)


class RecurrentAttentionBuilder(BaseAttentionBuilder):
    """Build attention implementations for autoregressive sequence
    processing."""
    def __init__(self):
        super(RecurrentAttentionBuilder, self).__init__(
            RecurrentAttentionRegistry
        )


class RecurrentCrossAttentionBuilder(BaseAttentionBuilder):
    """Build attention implementations for autoregressive cross attention
    computation."""
    def __init__(self):
        super(RecurrentCrossAttentionBuilder, self).__init__(
            RecurrentCrossAttentionRegistry
        )
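A minimal usage sketch (not part of the diff), assuming `AttentionBuilder` is imported from this package and that the "local" attention module above has been imported so its key is registered; the colon-separated decoration syntax described in the `get()` docstring works the same way once decorator-style implementations are registered.

    builder = AttentionBuilder.from_kwargs(
        local_context=16,          # required by the "local" key ("local_context", Int)
        attention_dropout=0.1,     # optional parameters fall back to their spec defaults
    )
    builder.available_attentions          # e.g. ["linear", "local", ...], depends on imports
    builder.validate_attention_type("local")   # -> True if "local" is registered
    attention = builder.get("local")            # a LocalAttention instance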
smi-ted/inference/smi_ted_light/fast_transformers/builders/base.py
ADDED
@@ -0,0 +1,67 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#

"""Provide a class for the others to inherit some useful functionality."""


class BaseBuilder(object):
    @classmethod
    def from_kwargs(cls, **kwargs):
        """Construct a builder and set all the keyword arguments as parameters.

        The keyword argument strict is passed to
        BaseBuilder.from_dictionary separately.

        See BaseBuilder.from_dictionary().
        """
        strict = kwargs.pop("strict", True)
        return cls.from_dictionary(kwargs, strict=strict)

    @classmethod
    def from_namespace(cls, args, strict=False):
        """Construct a builder from an argparse Namespace.

        To be used for building transformers from command line arguments.

        See BaseBuilder.from_dictionary().
        """
        return cls.from_dictionary(vars(args), strict=strict)

    @classmethod
    def from_dictionary(cls, dictionary, strict=True):
        """Construct a builder and set all the parameters in the dictionary.

        Given a dictionary

            d = {"foo": "bar"}

        then

            builder = TransformerEncoderBuilder.from_dictionary(d)

        is equivalent to

            builder = TransformerEncoderBuilder()
            builder.foo = "bar"

        Arguments
        ---------
            dictionary: A dictionary of parameters to set to the builder.
            strict: bool, If a key is not a parameter and strict is set to True
                    then a ValueError is raised, otherwise that dictionary key
                    is ignored (default: True)
        """
        builder = cls()
        for k, v in dictionary.items():
            try:
                setattr(builder, k, v)
            except AttributeError:
                if strict:
                    raise ValueError(("The builder has no "
                                      "parameter {!r}").format(k))
                else:
                    continue
        return builder
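A short sketch (not part of the diff) showing that the three constructors above share one code path; `AttentionBuilder` is assumed to be imported from this package and the argparse flag is made up for illustration. `from_namespace` defaults to strict=False, so extra namespace attributes that are not builder parameters are silently ignored.

    import argparse

    params = {"local_context": 16, "attention_dropout": 0.1}
    b1 = AttentionBuilder.from_dictionary(params)
    b2 = AttentionBuilder.from_kwargs(local_context=16, attention_dropout=0.1)  # same result

    parser = argparse.ArgumentParser()
    parser.add_argument("--local_context", type=int, default=16)
    args = parser.parse_args([])
    b3 = AttentionBuilder.from_namespace(args)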
smi-ted/inference/smi_ted_light/fast_transformers/builders/transformer_builders.py
ADDED
|
@@ -0,0 +1,550 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

"""Build complex transformer architectures for inference or training easily."""

from torch.nn import LayerNorm

from ..attention import AttentionLayer
from ..transformers import TransformerEncoder, TransformerEncoderLayer, \
    TransformerDecoder, TransformerDecoderLayer
from ..recurrent.attention import \
    RecurrentAttentionLayer, \
    RecurrentCrossAttentionLayer
from ..recurrent.transformers import \
    RecurrentTransformerEncoder, RecurrentTransformerEncoderLayer, \
    RecurrentTransformerDecoder, RecurrentTransformerDecoderLayer
from .base import BaseBuilder
from .attention_builders import AttentionBuilder, RecurrentAttentionBuilder, \
    RecurrentCrossAttentionBuilder


class BaseTransformerBuilder(BaseBuilder):
    """Contains all the parameters for building a transformer other than the
    attention part.

    Classes extending the BaseTransformerBuilder should implement the `get()`
    method that actually builds the transformer.
    """
    def __init__(self):
        # transformer parameters
        self._n_layers = 4
        self._n_heads = 4
        self._d_query = 64
        self._d_value = 64
        self._d_ff = 1024
        self._dropout = 0.1
        self._activation = "relu"
        self._final_norm = True
        self._event_dispatcher = ""  # the default global dispatcher

    @property
    def n_layers(self):
        """The number of transformer layers."""
        return self._n_layers

    @n_layers.setter
    def n_layers(self, val):
        self._n_layers = val

    @property
    def n_heads(self):
        """The number of heads in each transformer layer."""
        return self._n_heads

    @n_heads.setter
    def n_heads(self, val):
        self._n_heads = val

    @property
    def feed_forward_dimensions(self):
        """The dimensions of the fully connected layer in the transformer
        layers."""
        return self._d_ff

    @feed_forward_dimensions.setter
    def feed_forward_dimensions(self, val):
        self._d_ff = val

    @property
    def query_dimensions(self):
        """The dimensions of the queries and keys in each attention layer."""
        return self._d_query

    @query_dimensions.setter
    def query_dimensions(self, val):
        self._d_query = val

    @property
    def value_dimensions(self):
        """The dimensions of the values in each attention layer."""
        return self._d_value

    @value_dimensions.setter
    def value_dimensions(self, val):
        self._d_value = val

    @property
    def dropout(self):
        """The dropout rate to be applied in the transformer encoder layer."""
        return self._dropout

    @dropout.setter
    def dropout(self, val):
        self._dropout = val

    @property
    def activation(self):
        """The activation function for the transformer layer.

        One of {'relu', 'gelu'}.
        """
        return self._activation

    @activation.setter
    def activation(self, val):
        activations = ["relu", "gelu"]
        if val not in activations:
            raise ValueError(("{!r} is not one of the available activation "
                              "types {!r}").format(val, activations))
        self._activation = val

    @property
    def final_normalization(self):
        """Whether to add LayerNorm as the final layer of the
        TransformerEncoder."""
        return self._final_norm

    @final_normalization.setter
    def final_normalization(self, val):
        self._final_norm = bool(val)

    @property
    def event_dispatcher(self):
        """The transformer event dispatcher either as a string or as an
        EventDispatcher object."""
        return self._event_dispatcher

    @event_dispatcher.setter
    def event_dispatcher(self, event_dispatcher):
        self._event_dispatcher = event_dispatcher

    def get(self):
        """Build the transformer and return it."""
        raise NotImplementedError()


class BaseTransformerEncoderBuilder(BaseTransformerBuilder):
    """Implement the logic of building a transformer encoder but leave the
    specific layers open for changing by the inheriting classes. This allows us
    to reuse the logic for creating both the TransformerEncoder and the
    RecurrentTransformerEncoder.

    Inheriting classes should implement the following:

    - _get_attention_builder()
    - _get_attention_layer_class()
    - _get_encoder_class()
    - _get_encoder_layer_class()
    """
    def __init__(self):
        super(BaseTransformerEncoderBuilder, self).__init__()
        self._attention_builder = self._get_attention_builder()
        self._attention_type = "full"

    def _get_attention_builder(self):
        """Return an instance of the appropriate attention builder."""
        raise NotImplementedError()

    def _get_attention_layer_class(self):
        """Return the class for the layer that projects queries, keys and
        values."""
        raise NotImplementedError()

    def _get_encoder_class(self):
        """Return the class for the transformer encoder."""
        raise NotImplementedError()

    def _get_encoder_layer_class(self):
        """Return the class for the transformer encoder layer."""
        raise NotImplementedError()

    @property
    def attention(self):
        """The attention builder instance."""
        return self._attention_builder

    @property
    def attention_type(self):
        """The attention implementation chosen."""
        return self._attention_type

    @attention_type.setter
    def attention_type(self, val):
        if not self._attention_builder.validate_attention_type(val):
            raise ValueError(("{!r} is not an available attention "
                              "type").format(val))
        self._attention_type = val

    def __setattr__(self, key, val):
        # "protected" attributes are settable (probably from within the class)
        if key[0] == "_":
            return super().__setattr__(key, val)

        # Existing attributes are settable but they might also be attention
        # parameters so try that as well
        fail_on_exception = True
        if hasattr(self, key):
            super().__setattr__(key, val)
            fail_on_exception = False

        # Non-existing "public" attributes may be attention parameters
        try:
            setattr(self._attention_builder, key, val)
        except:
            if fail_on_exception:
                raise

    def get(self):
        """Build the transformer and return it."""
        # Set the event dispatcher to the attention builder
        self.attention.event_dispatcher = self.event_dispatcher

        # Extract into local variables the classes to be used
        Encoder = self._get_encoder_class()
        EncoderLayer = self._get_encoder_layer_class()
        Attention = self._get_attention_layer_class()

        model_dimensions = self.value_dimensions*self.n_heads
        return Encoder(
            [
                EncoderLayer(
                    Attention(
                        self.attention.get(self.attention_type),
                        model_dimensions,
                        self.n_heads,
                        d_keys=self.query_dimensions,
                        d_values=self.value_dimensions,
                        event_dispatcher=self.event_dispatcher
                    ),
                    model_dimensions,
                    self.feed_forward_dimensions,
                    self.dropout,
                    self.activation,
                    event_dispatcher=self.event_dispatcher
                )
                for _ in range(self.n_layers)
            ],
            (LayerNorm(model_dimensions) if self.final_normalization else None),
            event_dispatcher=self.event_dispatcher
        )


class TransformerEncoderBuilder(BaseTransformerEncoderBuilder):
    """Build a batch transformer encoder for training or processing of
    sequences all elements at a time.

    Example usage:

        builder = TransformerEncoderBuilder()
        builder.n_layers = 12
        builder.n_heads = 8
        builder.feed_forward_dimensions = 1024
        builder.query_dimensions = 64
        builder.value_dimensions = 64
        builder.dropout = 0.1
        builder.attention_dropout = 0.1
        builder.attention_type = "linear"
        transformer = builder.get()
    """
    def _get_attention_builder(self):
        """Return an instance of the appropriate attention builder."""
        return AttentionBuilder()

    def _get_attention_layer_class(self):
        """Return the class for the layer that projects queries, keys and
        values."""
        return AttentionLayer

    def _get_encoder_class(self):
        """Return the class for the transformer encoder."""
        return TransformerEncoder

    def _get_encoder_layer_class(self):
        """Return the class for the transformer encoder layer."""
        return TransformerEncoderLayer


class RecurrentEncoderBuilder(BaseTransformerEncoderBuilder):
    """Build a transformer encoder for autoregressive processing of sequences.

    Example usage:

        builder = RecurrentEncoderBuilder()
        builder.n_layers = 12
        builder.n_heads = 8
        builder.feed_forward_dimensions = 1024
        builder.query_dimensions = 64
        builder.value_dimensions = 64
        builder.dropout = 0.1
        builder.attention_dropout = 0.1
        builder.attention_type = "linear"
        transformer = builder.get()
    """
    def _get_attention_builder(self):
        """Return an attention builder for recurrent attention."""
        return RecurrentAttentionBuilder()

    def _get_attention_layer_class(self):
        """Return the class for the recurrent layer that projects queries, keys
        and values."""
        return RecurrentAttentionLayer

    def _get_encoder_class(self):
        """Return the class for the recurrent transformer encoder."""
        return RecurrentTransformerEncoder

    def _get_encoder_layer_class(self):
        """Return the class for the recurrent transformer encoder layer."""
        return RecurrentTransformerEncoderLayer


class BaseTransformerDecoderBuilder(BaseTransformerBuilder):
    """Similar to BaseTransformerEncoderBuilder, implement the logic of
    building the transformer decoder without defining concrete layers.

    Inheriting classes should implement the following:

    - _get_self_attention_builder() and _get_cross_attention_builder()
    - _get_self_attention_layer_class() and _get_cross_attention_layer_class()
    - _get_decoder_class()
    - _get_decoder_layer_class()
    """
    def __init__(self):
        super(BaseTransformerDecoderBuilder, self).__init__()
        self._self_attention_builder = self._get_self_attention_builder()
        self._cross_attention_builder = self._get_cross_attention_builder()
        self._self_attention_type = "full"
        self._cross_attention_type = "full"

    def _get_self_attention_builder(self):
        """Return an instance of attention builder."""
        raise NotImplementedError()

    def _get_cross_attention_builder(self):
        """Return an instance of attention builder."""
        raise NotImplementedError()

    def _get_self_attention_layer_class(self):
        """Return a class to project the queries, keys and values to
        multi-head versions."""
        raise NotImplementedError()

    def _get_cross_attention_layer_class(self):
        """Return a class to project the queries, keys and values to
        multi-head versions."""
        raise NotImplementedError()

    def _get_decoder_class(self):
        """Return the class for the transformer decoder."""
        raise NotImplementedError()

    def _get_decoder_layer_class(self):
        """Return the class for the transformer decoder layer."""
        raise NotImplementedError()

    @property
    def self_attention(self):
        """The attention builder instance that will be used for the self
        attention modules."""
        return self._self_attention_builder

    @property
    def self_attention_type(self):
        """The attention implementation used for self attention."""
        return self._self_attention_type

    @self_attention_type.setter
    def self_attention_type(self, val):
        if not self._self_attention_builder.validate_attention_type(val):
            raise ValueError(("{!r} is not an available self attention "
                              "type").format(val))
        self._self_attention_type = val

    @property
    def cross_attention(self):
        """The attention builder instance that will be used for the cross
        attention modules."""
        return self._cross_attention_builder

    @property
    def cross_attention_type(self):
        """The attention implementation used for cross attention."""
        return self._cross_attention_type

    @cross_attention_type.setter
    def cross_attention_type(self, val):
        if not self._cross_attention_builder.validate_attention_type(val):
            raise ValueError(("{!r} is not an available cross attention "
                              "type").format(val))
        self._cross_attention_type = val

    def __setattr__(self, key, val):
        # "protected" attributes are settable (probably from within the class)
        if key[0] == "_":
            return super().__setattr__(key, val)

        # Existing attributes are settable but they might also be attention
        # parameters so try that as well
        fail_on_exception = True
        if hasattr(self, key):
            super().__setattr__(key, val)
            fail_on_exception = False

        # Non-existing "public" attributes may be attention parameters
        try:
            setattr(self._self_attention_builder, key, val)
            setattr(self._cross_attention_builder, key, val)
        except:
            if fail_on_exception:
                raise

    def get(self):
        """Build the transformer and return it."""
        # Set the event dispatcher to attention builders
        self.self_attention.event_dispatcher = self.event_dispatcher
        self.cross_attention.event_dispatcher = self.event_dispatcher

        # Extract into local variables the classes to be used
        Decoder = self._get_decoder_class()
        DecoderLayer = self._get_decoder_layer_class()
        SelfAttention = self._get_self_attention_layer_class()
        CrossAttention = self._get_cross_attention_layer_class()

        model_dimensions = self.value_dimensions*self.n_heads
        return Decoder(
            [
                DecoderLayer(
                    SelfAttention(
                        self.self_attention.get(self.self_attention_type),
                        model_dimensions,
                        self.n_heads,
                        d_keys=self.query_dimensions,
                        d_values=self.value_dimensions,
                        event_dispatcher=self.event_dispatcher
                    ),
                    CrossAttention(
                        self.cross_attention.get(self.cross_attention_type),
                        model_dimensions,
                        self.n_heads,
                        d_keys=self.query_dimensions,
                        d_values=self.value_dimensions,
                        event_dispatcher=self.event_dispatcher
                    ),
                    model_dimensions,
                    self.feed_forward_dimensions,
                    self.dropout,
                    self.activation,
                    event_dispatcher=self.event_dispatcher
                )
                for _ in range(self.n_layers)
            ],
            (LayerNorm(model_dimensions) if self.final_normalization else None),
            event_dispatcher=self.event_dispatcher
        )


class TransformerDecoderBuilder(BaseTransformerDecoderBuilder):
    """Build a transformer decoder for training or processing of sequences all
    elements at a time.

    Example usage:

        builder = TransformerDecoderBuilder()
        builder.n_layers = 12
        builder.n_heads = 8
        builder.feed_forward_dimensions = 1024
        builder.query_dimensions = 64
        builder.value_dimensions = 64
        builder.dropout = 0.1
        builder.attention_dropout = 0.1
        builder.self_attention_type = "full"
        builder.cross_attention_type = "full"
        transformer = builder.get()
    """
    def _get_self_attention_builder(self):
        """Return an attention builder for creating non-recurrent attention
        variants."""
        return AttentionBuilder()

    def _get_cross_attention_builder(self):
        """Return an attention builder for creating non-recurrent attention
        variants."""
        return AttentionBuilder()

    def _get_self_attention_layer_class(self):
        """Return the non-recurrent attention layer to project queries, keys
        and values."""
        return AttentionLayer

    def _get_cross_attention_layer_class(self):
        """Return the non-recurrent attention layer to project queries, keys
        and values."""
        return AttentionLayer

    def _get_decoder_class(self):
        """Return the transformer decoder class."""
        return TransformerDecoder

    def _get_decoder_layer_class(self):
        """Return the transformer decoder layer class."""
        return TransformerDecoderLayer


class RecurrentDecoderBuilder(BaseTransformerDecoderBuilder):
    """Build a transformer decoder for processing of sequences in
    autoregressive fashion.

    Example usage:

        builder = RecurrentDecoderBuilder()
        builder.n_layers = 12
        builder.n_heads = 8
        builder.feed_forward_dimensions = 1024
        builder.query_dimensions = 64
        builder.value_dimensions = 64
        builder.dropout = 0.1
        builder.attention_dropout = 0.1
        builder.self_attention_type = "full"
        builder.cross_attention_type = "full"
        transformer = builder.get()
    """
    def _get_self_attention_builder(self):
        """Return an attention builder for creating recurrent attention
        variants."""
        return RecurrentAttentionBuilder()

    def _get_cross_attention_builder(self):
        """Return an attention builder for creating recurrent cross attention
        variants."""
        return RecurrentCrossAttentionBuilder()

    def _get_self_attention_layer_class(self):
        """Return the recurrent attention layer to project queries, keys
        and values."""
        return RecurrentAttentionLayer

    def _get_cross_attention_layer_class(self):
        """Return the recurrent cross attention layer to project queries, keys
        and values."""
        return RecurrentCrossAttentionLayer

    def _get_decoder_class(self):
        """Return the transformer decoder class."""
        return RecurrentTransformerDecoder

    def _get_decoder_layer_class(self):
        """Return the transformer decoder layer class."""
        return RecurrentTransformerDecoderLayer

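A short end-to-end sketch of the builder pattern above; the import path and the availability of the "full" attention type are assumptions (the vendored attention modules register it on import), and the shapes are illustrative only:

    import torch

    # Assumed import path for the vendored package
    from fast_transformers.builders import TransformerEncoderBuilder

    builder = TransformerEncoderBuilder.from_kwargs(
        n_layers=2,
        n_heads=4,
        query_dimensions=32,
        value_dimensions=32,
        feed_forward_dimensions=256,
        attention_type="full",   # e.g. "linear" if that variant is registered
    )
    encoder = builder.get()

    # The model dimension is value_dimensions * n_heads = 128
    x = torch.randn(8, 100, 128)   # (batch, sequence, features)
    y = encoder(x)
    print(y.shape)                 # torch.Size([8, 100, 128])
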
smi-ted/inference/smi_ted_light/fast_transformers/causal_product/__init__.py
ADDED
|
@@ -0,0 +1,78 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#

import torch

from .causal_product_cpu import causal_dot_product as causal_dot_product_cpu, \
    causal_dot_backward as causal_dot_backward_cpu

try:
    from .causal_product_cuda import \
        causal_dot_product as causal_dot_product_cuda, \
        causal_dot_backward as causal_dot_backward_cuda
except ImportError:
    causal_dot_product_cuda = causal_dot_backward_cuda = None


class CausalDotProduct(torch.autograd.Function):
    """Compute the weighted sum of values but attending only to previous
    values."""
    dot = {
        "cpu": causal_dot_product_cpu,
        "cuda": causal_dot_product_cuda
    }
    dot_backward = {
        "cpu": causal_dot_backward_cpu,
        "cuda": causal_dot_backward_cuda
    }

    @staticmethod
    def forward(ctx, Q, K, V):
        # Save the inputs for the gradient computation
        ctx.save_for_backward(Q, K, V)

        # Create the output tensor
        device = Q.device
        N, H, L, _ = Q.shape
        _, _, _, M = V.shape
        product = torch.zeros((N, H, L, M), device=device)

        # Actually perform the dot product
        CausalDotProduct.dot[device.type](
            Q.data,
            K.data,
            V.data,
            product
        )

        return product

    @staticmethod
    def backward(ctx, grad_out):
        # Extract the saved tensors
        Q, K, V = ctx.saved_tensors

        # Allocate memory for the gradients
        grad_Q = torch.zeros_like(Q)
        grad_K = torch.zeros_like(K)
        grad_V = torch.zeros_like(V)

        # Actually compute the gradients
        CausalDotProduct.dot_backward[Q.device.type](
            Q.data,
            K.data,
            V.data,
            grad_out,
            grad_Q,
            grad_K,
            grad_V
        )

        return grad_Q, grad_K, grad_V


# Alias the autograd functions to python style snake case naming
causal_dot_product = CausalDotProduct.apply

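A rough usage sketch of the autograd function above; shapes follow the (N, H, L, E) convention used in forward(), the import path is an assumption, and on CPU the call goes through the compiled extension shipped alongside this file:

    import torch

    # Assumed import path for the vendored package
    from fast_transformers.causal_product import causal_dot_product

    N, H, L, E, M = 2, 4, 100, 32, 32
    Q = torch.rand(N, H, L, E, requires_grad=True)
    K = torch.rand(N, H, L, E, requires_grad=True)
    V = torch.rand(N, H, L, M, requires_grad=True)

    # out[:, :, i] only aggregates values at positions <= i (causal masking)
    out = causal_dot_product(Q, K, V)
    out.sum().backward()             # gradients flow through the custom backward()
    print(out.shape, Q.grad.shape)   # (N, H, L, M) and (N, H, L, E)
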
smi-ted/inference/smi_ted_light/fast_transformers/causal_product/causal_product_cpu.cpython-39-x86_64-linux-gnu.so
ADDED
|
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:84f32370e707beebd8fee88f356fb62721096142265895a5a8e9872063c04595
size 140928

smi-ted/inference/smi_ted_light/fast_transformers/clustering/__init__.py
ADDED
|
File without changes
|
smi-ted/inference/smi_ted_light/fast_transformers/clustering/hamming/__init__.py
ADDED
|
@@ -0,0 +1,115 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>,
# Apoorv Vyas <avyas@idiap.ch>
#


import numpy as np

import torch

from .cluster_cpu import cluster as cluster_cpu
try:
    from .cluster_cuda import cluster as cluster_gpu
except ImportError:
    pass


def cluster(
    hashes,
    lengths,
    groups=None,
    counts=None,
    centroids=None,
    distances=None,
    bitcounts=None,
    clusters=30,
    iterations=10,
    bits=32
):
    """Cluster hashes using a few iterations of K-Means with hamming distance.

    All the tensors that default to None are optional buffers provided to
    avoid memory allocations. distances and bitcounts are only used by the
    CUDA version of this call. clusters will be ignored if centroids is
    provided.

    Arguments
    ---------
        hashes: A long tensor of shape (N, H, L) containing a hashcode for each
                query.
        lengths: An int tensor of shape (N,) containing the sequence length for
                 each sequence in hashes.
        groups: An int tensor buffer of shape (N, H, L) containing the cluster
                to which the corresponding hash belongs.
        counts: An int tensor buffer of shape (N, H, K) containing the number
                of elements in each cluster.
        centroids: A long tensor buffer of shape (N, H, K) containing the
                   centroid for each cluster.
        distances: An int tensor of shape (N, H, L) containing the distance to
                   the closest centroid for each hash.
        bitcounts: An int tensor of shape (N, H, K, bits) containing the number
                   of elements that have 1 for a given bit.
        clusters: The number of clusters to use for each sequence. It is
                  ignored if centroids is not None.
        iterations: How many k-means iterations to perform.
        bits: How many of the least-significant bits in hashes to consider.

    Returns
    -------
        groups and counts as defined above.
    """
    device = hashes.device
    N, H, L = hashes.shape

    # Unfortunately cpu and gpu have different APIs so the entire call must be
    # surrounded by an if-then-else
    if device.type == "cpu":
        if groups is None:
            groups = torch.empty((N, H, L), dtype=torch.int32)
        if centroids is None:
            centroids = torch.empty((N, H, clusters), dtype=torch.int64)
            centroids = hashes[:, :, np.random.choice(L, size=[clusters], replace=False)]
        K = centroids.shape[2]
        if counts is None:
            counts = torch.empty((N, H, K), dtype=torch.int32)

        cluster_cpu(
            hashes, lengths,
            centroids, groups, counts,
            iterations, bits
        )

        return groups, counts

    else:
        if groups is None:
            groups = torch.empty((N, H, L), dtype=torch.int32, device=device)
        if centroids is None:
            centroids = torch.empty((N, H, clusters), dtype=torch.int64,
                                    device=device)
            centroids = hashes[:, :, np.random.choice(L, size=[clusters], replace=False)]
        K = centroids.numel() // N // H
        # K = clusters
        if counts is None:
            counts = torch.empty((N, H, K), dtype=torch.int32, device=device)
        if distances is None:
            distances = torch.empty((N, H, L), dtype=torch.int32,
                                    device=device)
        if bitcounts is None:
            bitcounts = torch.empty((N, H, K, bits), dtype=torch.int32,
                                    device=device)
        groups = groups.view(N, H, L)
        counts = counts.view(N, H, K)
        centroids = centroids.view(N, H, K)
        distances = distances.view(N, H, L)
        bitcounts = bitcounts.view(N, H, K, -1)

        cluster_gpu(
            hashes, lengths,
            centroids, distances, bitcounts, groups, counts,
            iterations, bits
        )

        return groups, counts

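A minimal CPU sketch of calling cluster() with random hash codes; the import path, the int32 dtype for lengths, and the sizes are assumptions made for illustration:

    import torch

    # Assumed import path for the vendored package
    from fast_transformers.clustering.hamming import cluster

    N, H, L = 2, 4, 256
    hashes = torch.randint(0, 2**16, (N, H, L), dtype=torch.int64)
    lengths = torch.full((N,), L, dtype=torch.int32)  # dtype assumed by the CPU kernel

    # groups[n, h, l] is the cluster index of the l-th hash,
    # counts[n, h, k] the number of members of cluster k.
    groups, counts = cluster(hashes, lengths, clusters=16, iterations=5, bits=16)
    print(groups.shape, counts.shape)  # (N, H, L) and (N, H, 16)
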
smi-ted/inference/smi_ted_light/fast_transformers/clustering/hamming/cluster_cpu.cpython-39-x86_64-linux-gnu.so
ADDED
|
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f2bd8f761d6e1efdeea33665cad8702b5c07d1a0db728d19cf332c4383510d45
size 139824

smi-ted/inference/smi_ted_light/fast_transformers/events/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

"""This module implements a basic event system that allows the transformer
internal components to make available any tensor with minimal overhead."""

from .event import Event, AttentionEvent, QKVEvent
from .event_dispatcher import EventDispatcher

smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (556 Bytes).
|
|
|
smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/event.cpython-310.pyc
ADDED
|
Binary file (2.21 kB).
|
|
|
smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/event_dispatcher.cpython-310.pyc
ADDED
|
Binary file (3.5 kB).
|
|
|
smi-ted/inference/smi_ted_light/fast_transformers/events/__pycache__/filters.cpython-310.pyc
ADDED
|
Binary file (5.82 kB).
|
|
|
smi-ted/inference/smi_ted_light/fast_transformers/events/event.py
ADDED
|
@@ -0,0 +1,51 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#


class Event(object):
    """The Event is the base class for all events that are dispatched from any
    transformer module.

    This class defines only the basic attributes of an event without any
    payload.

    Arguments
    ---------
        source: torch.nn.Module instance that dispatched this event
    """
    def __init__(self, source):
        self.source = source


class AttentionEvent(Event):
    """An event containing an attention matrix.

    Arguments
    ---------
        source: torch.nn.Module instance that dispatched this event
        attention_matrix: torch.tensor of the multihead attention matrix
                          computed in the corresponding attention layer
    """
    def __init__(self, source, attention_matrix):
        super(AttentionEvent, self).__init__(source)
        self.attention_matrix = attention_matrix


class QKVEvent(Event):
    """An event containing the queries, keys and values projected in their
    multiple heads.

    Arguments
    ---------
        source: torch.nn.Module instance that dispatched this event
        queries: torch.tensor containing the queries in shape NLHE
        keys: torch.tensor containing the keys in shape NSHE
        values: torch.tensor containing the values in shape NSHD
    """
    def __init__(self, source, queries, keys, values):
        super(QKVEvent, self).__init__(source)
        self.queries = queries
        self.keys = keys
        self.values = values

smi-ted/inference/smi_ted_light/fast_transformers/events/event_dispatcher.py
ADDED
|
@@ -0,0 +1,92 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

from collections import OrderedDict

from .event import Event
from .filters import event_class


class EventDispatcher(object):
    """An EventDispatcher is a simple way to implement an observer pattern for
    loose coupling of components. In our case it is used so that the internals
    of large neural networks can communicate with the outside world in an
    agnostic and efficient way.

    Example usage
    -------------

        from fast_transformers.events import EventDispatcher, AttentionEvent
        from fast_transformers.events.filters import \
            layer_name_contains

        def attention_event_handler(event):
            print(event.attention_matrix)

        ed = EventDispatcher()
        ed.listen(AttentionEvent, attention_event_handler)
        ed.listen(
            AttentionEvent & layer_name_contains("layers.12"),
            attention_event_handler
        )
    """
    _dispatchers = {}

    def __init__(self):
        self._listeners = OrderedDict()

    def listen(self, event_filter, event_handler):
        """Add an event handler for the events that pass the event filter.

        Arguments
        ---------
            event_filter: callable or Event class to define for which events
                          this handler will be called
            event_handler: callable that accepts an instance of Event
        """
        if isinstance(event_filter, type) and issubclass(event_filter, Event):
            event_filter = event_class(event_filter)

        self._listeners[event_handler] = event_filter

    def remove(self, event_handler):
        """Remove the event_handler from the listeners so that no more events
        are dispatched to this handler."""
        self._listeners.pop(event_handler, None)

    def clear(self):
        """Remove all listeners from the event dispatcher."""
        self._listeners.clear()

    def dispatch(self, event):
        """Dispatch an event to the listeners.

        Arguments
        ---------
            event: Event instance
        """
        for event_handler, event_filter in self._listeners.items():
            if event_filter(event):
                event_handler(event)

    @classmethod
    def get(cls, key=""):
        """Factory method for creating global event dispatchers for loosely
        coupling parts of a larger codebase.

        Since global objects are a complete antipattern, we suggest that this
        is only used to set a default value for an event dispatcher passed as
        an argument.

        Argument
        --------
            key: A key to uniquely identify a dispatcher or an instance of a
                 dispatcher to be returned as is
        """
        if isinstance(key, cls):
            return key
        if key not in cls._dispatchers:
            cls._dispatchers[key] = cls()
        return cls._dispatchers[key]

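A small sketch of the full listen/dispatch round trip using the classes above; the import path is an assumption and the Identity module is just a stand-in for an attention layer that would normally emit the event:

    import torch

    # Assumed import path for the vendored package
    from fast_transformers.events import EventDispatcher, AttentionEvent

    def handler(event):
        print("attention matrix with shape", event.attention_matrix.shape)

    ed = EventDispatcher.get()      # the default global dispatcher ("")
    ed.listen(AttentionEvent, handler)

    # Somewhere inside an attention module an event would be dispatched:
    layer = torch.nn.Identity()     # stand-in for the emitting module
    ed.dispatch(AttentionEvent(layer, torch.rand(2, 4, 10, 10)))

    ed.remove(handler)              # stop receiving events
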
smi-ted/inference/smi_ted_light/fast_transformers/events/filters.py
ADDED
|
@@ -0,0 +1,141 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

"""Define composable functions to filter events."""

import weakref

from .event import Event


class EventFilter(object):
    """EventFilter instances are predicates (i.e. functions that return True or
    False) to be used with an event dispatcher for filtering event
    instances.

    The main benefit from using raw functions is that an EventFilter composes
    very easily using operators such as &, |, ~.

    Example
    --------

        event_filter = AttentionEvent | layer_name_contains("layers.1")
        event_filter = from_layer(transformer.layers[2].attention)
        event_filter = (
            AttentionEvent &
            lambda ev: torch.isnan(ev.attention_matrix).any()
        )
    """
    def __call__(self, event):
        raise NotImplementedError()

    def _to_event_filter(self, other):
        if isinstance(other, EventFilter):
            return other
        if isinstance(other, type) and issubclass(other, Event):
            return event_class(other)
        if callable(other):
            return CallableEventFilter(other)

        return NotImplemented

    def __and__(self, other):
        other = self._to_event_filter(other)
        if other is NotImplemented:
            return other
        return CallableEventFilter(lambda ev: self(ev) and other(ev))

    def __rand__(self, other):
        other = self._to_event_filter(other)
        if other is NotImplemented:
            return other
        return CallableEventFilter(lambda ev: other(ev) and self(ev))

    def __or__(self, other):
        other = self._to_event_filter(other)
        if other is NotImplemented:
            return other
        return CallableEventFilter(lambda ev: self(ev) or other(ev))

    def __ror__(self, other):
        other = self._to_event_filter(other)
        if other is NotImplemented:
            return other
        return CallableEventFilter(lambda ev: other(ev) or self(ev))

    def __invert__(self):
        return CallableEventFilter(lambda ev: not self(ev))


class CallableEventFilter(EventFilter):
    """Wrap a function with an EventFilter object."""
    def __init__(self, event_filter):
        self._event_filter = event_filter

    def __call__(self, event):
        return self._event_filter(event)


class LayerNameEventFilter(EventFilter):
    """A LayerNameEventFilter allows to filter events based on a human readable
    name of the layer that emitted them.

    Note that LayerNameEventFilter keeps a weak reference to all modules which
    means that it cannot be used to prevent modules from being garbage
    collected.

    Arguments
    ---------
        root: torch.nn.Module instance that represents the root container
        name_filter: callable that returns True if a layer name should be
                     accepted
    """
    def __init__(self, root, name_filter):
        self._names = {
            weakref.ref(m): n
            for n, m in root.named_modules()
        }
        self._name_filter = name_filter

    def __call__(self, event):
        name = self._names.get(weakref.ref(event.source), None)
        if name is None:
            return False
        return self._name_filter(name)


def event_class(klass):
    """Select events that are instances of `klass`.

    Arguments
    ---------
        klass: A class to check the event instance against

    Returns
    -------
        An instance of EventFilter
    """
    return CallableEventFilter(lambda ev: isinstance(ev, klass))


def from_layer(layer):
    """Select events that are dispatched from the `layer`.

    Arguments
    ---------
        layer: An instance of torch.nn.Module to check against the event source

    Returns
    -------
        An instance of EventFilter
    """
    return CallableEventFilter(lambda ev: ev.source is layer)


def layer_name_contains(root, name):
    """Select events that contain `name` in their human readable name.

    We use root.named_modules() to get human readable names for the layers.
    """
    return LayerNameEventFilter(root, lambda n: name in n)

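A brief sketch of composing the filters above; the import path and the toy model are placeholders chosen for illustration:

    import torch

    # Assumed import path for the vendored package
    from fast_transformers.events import EventDispatcher, AttentionEvent
    from fast_transformers.events.filters import from_layer, layer_name_contains

    model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 4))

    # Only attention events emitted by modules whose name contains "1"
    event_filter = AttentionEvent & layer_name_contains(model, "1")

    # Or: only events coming from one specific submodule
    event_filter = from_layer(model[0]) & AttentionEvent

    EventDispatcher.get().listen(event_filter, print)
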
smi-ted/inference/smi_ted_light/fast_transformers/feature_maps/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
#
# Copyright (c) 2020 Idiap Research Institute, http://www.idiap.ch/
# Written by Angelos Katharopoulos <angelos.katharopoulos@idiap.ch>
#

"""Implementations of feature maps to be used with linear attention and causal
linear attention."""


from .base import elu_feature_map, ActivationFunctionFeatureMap
from .fourier_features import RandomFourierFeatures, Favor, \
    SmoothedRandomFourierFeatures, GeneralizedRandomFeatures

smi-ted/inference/smi_ted_light/fast_transformers/feature_maps/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (614 Bytes).
|
|
|
smi-ted/inference/smi_ted_light/fast_transformers/feature_maps/__pycache__/base.cpython-310.pyc
ADDED
|
Binary file (3.42 kB).
|
|
|