Upload 25 files
- utils/__init__.py +0 -0
- utils/__pycache__/bert_model.cpython-39.pyc +0 -0
- utils/__pycache__/callbacks.cpython-39.pyc +0 -0
- utils/__pycache__/file_utils.cpython-39.pyc +0 -0
- utils/__pycache__/finetune.cpython-39.pyc +0 -0
- utils/__pycache__/lightning_base.cpython-39.pyc +0 -0
- utils/__pycache__/sentence_retrieval_model.cpython-39.pyc +0 -0
- utils/__pycache__/sentence_retrieval_module.cpython-39.pyc +0 -0
- utils/__pycache__/textual_entailment_module.cpython-39.pyc +0 -0
- utils/__pycache__/utils_graph2text.cpython-39.pyc +0 -0
- utils/__pycache__/utils_verbalisation_module.cpython-39.pyc +0 -0
- utils/__pycache__/verbalisation_module.cpython-39.pyc +0 -0
- utils/__pycache__/wikidata_utils.cpython-39.pyc +0 -0
- utils/bert_model.py +775 -0
- utils/callbacks.py +140 -0
- utils/file_utils.py +249 -0
- utils/finetune.py +633 -0
- utils/lightning_base.py +418 -0
- utils/sentence_retrieval_model.py +20 -0
- utils/sentence_retrieval_module.py +77 -0
- utils/textual_entailment_module.py +94 -0
- utils/utils_graph2text.py +114 -0
- utils/utils_verbalisation_module.py +610 -0
- utils/verbalisation_module.py +300 -0
- utils/wikidata_utils.py +173 -0
utils/__init__.py
ADDED
File without changes

utils/__pycache__/bert_model.cpython-39.pyc
ADDED
Binary file (30.6 kB)

utils/__pycache__/callbacks.cpython-39.pyc
ADDED
Binary file (4.9 kB)

utils/__pycache__/file_utils.cpython-39.pyc
ADDED
Binary file (6.81 kB)

utils/__pycache__/finetune.cpython-39.pyc
ADDED
Binary file (20.2 kB)

utils/__pycache__/lightning_base.cpython-39.pyc
ADDED
Binary file (13.5 kB)

utils/__pycache__/sentence_retrieval_model.cpython-39.pyc
ADDED
Binary file (1.11 kB)

utils/__pycache__/sentence_retrieval_module.cpython-39.pyc
ADDED
Binary file (2.52 kB)

utils/__pycache__/textual_entailment_module.cpython-39.pyc
ADDED
Binary file (2.65 kB)

utils/__pycache__/utils_graph2text.cpython-39.pyc
ADDED
Binary file (3.12 kB)

utils/__pycache__/utils_verbalisation_module.cpython-39.pyc
ADDED
Binary file (23.9 kB)

utils/__pycache__/verbalisation_module.cpython-39.pyc
ADDED
Binary file (7.37 kB)

utils/__pycache__/wikidata_utils.cpython-39.pyc
ADDED
Binary file (5.29 kB)
utils/bert_model.py
ADDED
@@ -0,0 +1,775 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch BERT model."""

from __future__ import absolute_import, division, print_function, unicode_literals

import copy
import json
import logging
import math
import os
import shutil
import tarfile
import tempfile
import sys
from io import open

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from utils.file_utils import cached_path

logger = logging.getLogger(__name__)

PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
CONFIG_NAME = 'bert_config.json'
WEIGHTS_NAME = 'pytorch_model.bin'
TF_WEIGHTS_NAME = 'model.ckpt'


def load_tf_weights_in_bert(model, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
              "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            else:
                pointer = getattr(pointer, l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model


def gelu(x):
    """Implementation of the gelu activation function.
    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
    return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.
    """
    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02):
        """Constructs BertConfig.

        Args:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probabilitiy for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The sttdev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                                                               and isinstance(vocab_size_or_config_json_file, unicode)):
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
        else:
            raise ValueError("First argument must be either a vocabulary size (int)"
                             "or the path to a pretrained model config file (str)")

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


try:
    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
except ImportError:
    print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")

    class BertLayerNorm(nn.Module):
        def __init__(self, hidden_size, eps=1e-12):
            """Construct a layernorm module in the TF style (epsilon inside the square root).
            """
            super(BertLayerNorm, self).__init__()
            self.weight = nn.Parameter(torch.ones(hidden_size))
            self.bias = nn.Parameter(torch.zeros(hidden_size))
            self.variance_epsilon = eps

        def forward(self, x):
            u = x.mean(-1, keepdim=True)
            s = (x - u).pow(2).mean(-1, keepdim=True)
            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
            return self.weight * x + self.bias


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """
    def __init__(self, config):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BertSelfAttention(nn.Module):
    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
        attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer


class BertSelfOutput(nn.Module):
    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertAttention(nn.Module):
    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output


class BertIntermediate(nn.Module):
    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BertLayer(nn.Module):
    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        attention_output = self.attention(hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BertEncoder(nn.Module):
    def __init__(self, config):
        super(BertEncoder, self).__init__()
        layer = BertLayer(config)
        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        all_encoder_layers = []
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states, attention_mask)
            if output_all_encoded_layers:
                all_encoder_layers.append(hidden_states)
        if not output_all_encoded_layers:
            all_encoder_layers.append(hidden_states)
        return all_encoder_layers


class BertPooler(nn.Module):
    def __init__(self, config):
        super(BertPooler, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super(BertPredictionHeadTransform, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertLMPredictionHead, self).__init__()
        self.transform = BertPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
                                 bert_model_embedding_weights.size(0),
                                 bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states) + self.bias
        return hidden_states


class BertOnlyMLMHead(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertOnlyMLMHead, self).__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(nn.Module):
    def __init__(self, config):
        super(BertOnlyNSPHead, self).__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


class BertPreTrainingHeads(nn.Module):
    def __init__(self, config, bert_model_embedding_weights):
        super(BertPreTrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score


class BertPreTrainedModel(nn.Module):
    """ An abstract class to handle weights initialization and
        a simple interface for dowloading and loading pretrained models.
    """
    def __init__(self, config, *inputs, **kwargs):
        super(BertPreTrainedModel, self).__init__()
        if not isinstance(config, BertConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__
                ))
        self.config = config

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
                        from_tf=False, *inputs, **kwargs):
        """
        Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.

        Params:
            pretrained_model_name_or_path: either:
                - a str with the name of a pre-trained model to load selected in the list of:
                    . `bert-base-uncased`
                    . `bert-large-uncased`
                    . `bert-base-cased`
                    . `bert-large-cased`
                    . `bert-base-multilingual-uncased`
                    . `bert-base-multilingual-cased`
                    . `bert-base-chinese`
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
                - a path or url to a pretrained model archive containing:
                    . `bert_config.json` a configuration file for the model
                    . `model.chkpt` a TensorFlow checkpoint
            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
            *inputs, **kwargs: additional input for the specific Bert class
                (ex: num_labels for BertForSequenceClassification)
        """
        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
        else:
            archive_file = pretrained_model_name_or_path
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find any file "
                "associated to this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
                    archive_file))
            return None
        if resolved_archive_file == archive_file:
            logger.info("loading archive file {}".format(archive_file))
        else:
            logger.info("loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file))
        tempdir = None
        if os.path.isdir(resolved_archive_file) or from_tf:
            serialization_dir = resolved_archive_file
        else:
            # Extract archive to temp dir
            tempdir = tempfile.mkdtemp()
            logger.info("extracting archive file {} to temp dir {}".format(
                resolved_archive_file, tempdir))
            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
                archive.extractall(tempdir)
            serialization_dir = tempdir
        # Load config
        config_file = os.path.join(serialization_dir, CONFIG_NAME)
        config = BertConfig.from_json_file(config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None and not from_tf:
            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
            state_dict = torch.load(weights_path, map_location='cpu' if not torch.cuda.is_available() else None)
        if tempdir:
            # Clean up temp dir
            shutil.rmtree(tempdir)
        if from_tf:
            # Directly load from a TensorFlow checkpoint
            weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
            return load_tf_weights_in_bert(model, weights_path)
        # Load from a PyTorch state_dict
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(
                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')
        start_prefix = ''
        if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
            start_prefix = 'bert.'
        load(model, prefix=start_prefix)
        if len(missing_keys) > 0:
            logger.info("Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info("Weights from pretrained model not used in {}: {}".format(
                model.__class__.__name__, unexpected_keys))
        if len(error_msgs) > 0:
            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                model.__class__.__name__, "\n\t".join(error_msgs)))
        return model


class BertModel(BertPreTrainedModel):
    """BERT model ("Bidirectional Embedding Representations from a Transformer").

    Params:
        config: a BertConfig class instance with the configuration to build a new model

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.

    Outputs: Tuple of (encoded_layers, pooled_output)
        `encoded_layers`: controled by `output_all_encoded_layers` argument:
            - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
                of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
                encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
            - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
                to the last attention block of shape [batch_size, sequence_length, hidden_size],
        `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
            classifier pretrained on top of the hidden state associated to the first character of the
            input (`CLS`) to train on the Next-Sentence task (see BERT's paper).

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = modeling.BertModel(config=config)
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config):
        super(BertModel, self).__init__(config)
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoded_layers = self.encoder(embedding_output,
                                      extended_attention_mask,
                                      output_all_encoded_layers=output_all_encoded_layers)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)
        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        return encoded_layers, pooled_output


class BertForSequenceEncoder(BertPreTrainedModel):
    """BERT model for classification.
    This module is composed of the BERT model with a linear layer on top of
    the pooled output.
    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_labels`: the number of classes for the classifier. Default = 2.
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
            with indices selected in [0, ..., num_labels].
    Outputs:
        if `labels` is not `None`:
            Outputs the CrossEntropy classification loss of the output with the labels.
        if `labels` is `None`:
            Outputs the classification logits of shape [batch_size, num_labels].
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    num_labels = 2
    model = BertForSequenceClassification(config, num_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config):
        super(BertForSequenceEncoder, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        output = self.dropout(output)
        pooled_output = self.dropout(pooled_output)
        return output, pooled_output
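The main entry point of this file is `BertForSequenceEncoder`, presumably consumed by the other modules uploaded here (such as utils/sentence_retrieval_model.py). As a quick orientation aid only, and not part of the upload, here is a minimal sketch of how it can be exercised. It assumes the repository root is on `PYTHONPATH` so `utils.bert_model` imports, and it uses a small randomly initialised config rather than downloading the `bert-base-uncased` archive via `from_pretrained`.

```python
# Sketch (not in the upload): run a toy batch through BertForSequenceEncoder.
import torch

from utils.bert_model import BertConfig, BertForSequenceEncoder

# Tiny config with random weights so the sketch runs without any download.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=2, num_attention_heads=12,
                    intermediate_size=3072)
model = BertForSequenceEncoder(config)
model.eval()  # disable dropout for a deterministic forward pass

# Inputs are assumed to already be WordPiece ids, as in the docstrings above.
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.zeros_like(input_ids)

with torch.no_grad():
    sequence_output, pooled_output = model(input_ids, attention_mask, token_type_ids)

print(sequence_output.shape)  # [2, 3, 768] - per-token encodings
print(pooled_output.shape)    # [2, 768]   - [CLS]-based sequence encoding
```

With pretrained weights one would instead call `BertForSequenceEncoder.from_pretrained('bert-base-uncased')`, which downloads and caches the archive through `cached_path` from utils/file_utils.py.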
utils/callbacks.py
ADDED
@@ -0,0 +1,140 @@
import logging
import os
from pathlib import Path

import numpy as np
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.utilities import rank_zero_only

from utils.utils_verbalisation_module import save_json
from pytorch_lightning.utilities import rank_zero_info


def count_trainable_parameters(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    return params


logger = logging.getLogger(__name__)


class Seq2SeqLoggingCallback(pl.Callback):
    def on_batch_end(self, trainer, pl_module):
        lrs = {f"lr_group_{i}": param["lr"] for i, param in enumerate(pl_module.trainer.optimizers[0].param_groups)}
        pl_module.logger.log_metrics(lrs)

    @rank_zero_only
    def _write_logs(
        self, trainer: pl.Trainer, pl_module: pl.LightningModule, type_path: str, save_generations=True
    ) -> None:
        logger.info(f"***** {type_path} results at step {trainer.global_step:05d} *****")
        metrics = trainer.callback_metrics
        # print(metrics.keys())
        new_metrics = {}
        ms = ["log", "progress_bar", "preds"]
        for k, v in metrics.items():
            ver = True
            for m in ms:
                if m in k:
                    ver = False
                    break
            if ver:
                new_metrics[k] = v

        print(new_metrics)
        trainer.logger.log_metrics(new_metrics)
        # Log results
        od = Path(pl_module.hparams.output_dir)
        if type_path == "test":
            results_file = od / "test_results.txt"
            generations_file = od / "test_generations.txt"
        else:
            # this never gets hit. I prefer not to save intermediate generations, and results are in metrics.json
            # If people want this it will be easy enough to add back.
            results_file = od / f"{type_path}_results/{trainer.global_step:05d}.txt"
            generations_file = od / f"{type_path}_generations/{trainer.global_step:05d}.txt"
            results_file.parent.mkdir(exist_ok=True)
            generations_file.parent.mkdir(exist_ok=True)
        with open(results_file, "a+") as writer:
            for key in sorted(metrics):
                if key in ["log", "progress_bar", "preds"]:
                    continue
                try:
                    val = metrics[key]
                    if isinstance(val, torch.Tensor):
                        val = val.item()
                    msg = f"{key}: {val:.6f}\n"
                    writer.write(msg)
                except:
                    pass

        if not save_generations:
            return

        if "preds" in metrics:
            content = "\n".join(metrics["preds"])
            generations_file.open("w+").write(content)

    @rank_zero_only
    def on_train_start(self, trainer, pl_module):
        try:
            npars = pl_module.model.model.num_parameters()
        except AttributeError:
            npars = pl_module.model.num_parameters()

        n_trainable_pars = count_trainable_parameters(pl_module)
        # mp stands for million parameters
        trainer.logger.log_metrics({"n_params": npars, "mp": npars / 1e6, "grad_mp": n_trainable_pars / 1e6})

    @rank_zero_only
    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        save_json(pl_module.metrics, pl_module.metrics_save_path)
        return self._write_logs(trainer, pl_module, "test")

    @rank_zero_only
    def on_validation_end(self, trainer: pl.Trainer, pl_module):
        save_json(pl_module.metrics, pl_module.metrics_save_path)

        rank_zero_info("***** Validation results *****")
        metrics = trainer.callback_metrics
        # Log results
        for key in sorted(metrics):
            if key not in ["log", "progress_bar", "preds"]:
                rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
        # Uncommenting this will save val generations
        # return self._write_logs(trainer, pl_module, "valid")


def get_checkpoint_callback(output_dir, metric, save_top_k=1, lower_is_better=False):
    """Saves the best model by validation ROUGE2 score."""
    if metric == "rouge2":
        exp = "{val_avg_rouge2:.4f}-{step_count}"
    elif metric == "bleu":
        exp = "{val_avg_bleu:.4f}-{step_count}"
    elif metric == "loss":
        exp = "{val_avg_loss:.4f}-{step_count}"
    else:
        raise NotImplementedError(
            f"seq2seq callbacks only support rouge2, bleu and loss, got {metric}, You can make your own by adding to this function."
        )

    checkpoint_callback = ModelCheckpoint(
        filepath=os.path.join(output_dir, exp),
        monitor=f"val_{metric}",
        mode="min" if "loss" in metric else "max",
        save_top_k=save_top_k,
        period=0,  # maybe save a checkpoint every time val is run, not just end of epoch.
    )
    return checkpoint_callback


def get_early_stopping_callback(metric, patience):
    return EarlyStopping(
        monitor=f"val_{metric}",  # does this need avg?
        mode="min" if "loss" in metric else "max",
        patience=patience,
        verbose=True,
    )
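These helpers target an older pytorch-lightning API (note `ModelCheckpoint(filepath=..., period=...)`). The following is a hedged sketch, not part of the upload, of how they would typically be attached to a `Trainer`; the `output_dir` path and the `module` object are hypothetical, and the `checkpoint_callback`/`early_stop_callback` keyword arguments assume that same older Lightning version.

```python
# Sketch (not in the upload): wiring the callbacks into a Trainer, assuming the
# older pytorch-lightning API these helpers were written against and a
# LightningModule (`module`) that logs a `val_loss` metric.
import pytorch_lightning as pl

from utils.callbacks import (
    Seq2SeqLoggingCallback,
    get_checkpoint_callback,
    get_early_stopping_callback,
)

output_dir = "outputs/verbalisation"  # hypothetical output directory

checkpoint = get_checkpoint_callback(output_dir, metric="loss", save_top_k=1)
early_stop = get_early_stopping_callback(metric="loss", patience=3)

trainer = pl.Trainer(
    max_epochs=10,
    callbacks=[Seq2SeqLoggingCallback()],  # logs learning rates and val/test results
    checkpoint_callback=checkpoint,        # keeps the best checkpoint by val_loss
    early_stop_callback=early_stop,        # stops training when val_loss plateaus
)
# trainer.fit(module)  # `module` would be the seq2seq LightningModule defined elsewhere
```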
utils/file_utils.py
ADDED
@@ -0,0 +1,249 @@
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function, unicode_literals)

import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open

import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

try:
    from pathlib import Path
    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                                   Path.home() / '.pytorch_pretrained_bert'))
except AttributeError:
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                              os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))

logger = logging.getLogger(__name__) # pylint: disable=invalid-name


def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode('utf-8')
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()

    return filename


def filename_to_url(filename, cache_dir=None):
    """
    Return the url and etag (which may be ``None``) stored for `filename`.
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        raise EnvironmentError("file {} not found".format(cache_path))

    meta_path = cache_path + '.json'
    if not os.path.exists(meta_path):
        raise EnvironmentError("file {} not found".format(meta_path))

    with open(meta_path, encoding="utf-8") as meta_file:
        metadata = json.load(meta_file)
    url = metadata['url']
    etag = metadata['etag']

    return url, etag


def cached_path(url_or_filename, cache_dir=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    parsed = urlparse(url_or_filename)

    if parsed.scheme in ('http', 'https', 's3'):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
    elif parsed.scheme == '':
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))


def split_s3_path(url):
    """Split a full s3 path into the bucket name and path."""
    parsed = urlparse(url)
    if not parsed.netloc or not parsed.path:
        raise ValueError("bad s3 path {}".format(url))
    bucket_name = parsed.netloc
    s3_path = parsed.path
    # Remove '/' at beginning of path.
    if s3_path.startswith("/"):
        s3_path = s3_path[1:]
    return bucket_name, s3_path


def s3_request(func):
    """
    Wrapper function for s3 requests in order to create more helpful error
    messages.
    """

    @wraps(func)
    def wrapper(url, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            if int(exc.response["Error"]["Code"]) == 404:
                raise EnvironmentError("file {} not found".format(url))
            else:
                raise

    return wrapper


@s3_request
def s3_etag(url):
    """Check ETag on S3 object."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag


@s3_request
def s3_get(url, temp_file):
    """Pull a file directly from S3."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


def http_get(url, temp_file):
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk: # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
        etag = s3_etag(url)
    else:
        response = requests.head(url, allow_redirects=True)
        if response.status_code != 200:
            raise IOError("HEAD request failed for url {} with status code {}"
                          .format(url, response.status_code))
        etag = response.headers.get("ETag")

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            if url.startswith("s3://"):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w', encoding="utf-8") as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path


def read_set_from_file(filename):
    '''
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    '''
    collection = set()
    with open(filename, 'r', encoding='utf-8') as file_:
        for line in file_:
            collection.add(line.rstrip())
    return collection


def get_file_extension(path, dot=True, lower=True):
    ext = os.path.splitext(path)[1]
    ext = ext if dot else ext[1:]
    return ext.lower() if lower else ext
utils/finetune.py
ADDED
@@ -0,0 +1,633 @@
#!/usr/bin/env python

import argparse
import glob
import logging
import os
import sys
import time
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
import pdb

import numpy as np
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader

from pytorch_lightning.utilities import rank_zero_info

from utils.callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
from transformers import MBartTokenizer, T5ForConditionalGeneration

from transformers.models.bart.modeling_bart import shift_tokens_right
from utils.utils_verbalisation_module import (
    ROUGE_KEYS,
    LegacySeq2SeqDataset,
    Seq2SeqDataset,
    assert_all_frozen,
    calculate_bleu,
    calculate_rouge,
    flatten_list,
    freeze_embeds,
    freeze_params,
    label_smoothed_nll_loss,
    lmap,
    pickle_save,
    save_json,
    use_task_specific_params,
)

from utils.utils_graph2text import convert_text, eval_meteor, eval_bleu, eval_chrf, eval_meteor_test_webnlg, eval_chrf_test_webnlg

# need the parent dir module
sys.path.insert(2, str(Path(__file__).resolve().parents[1]))
from utils.lightning_base import BaseTransformer, add_generic_args, generic_train  # noqa


logger = logging.getLogger(__name__)


class SummarizationModule(BaseTransformer):
    mode = "summarization"
    loss_names = ["loss"]
    metric_names = ROUGE_KEYS
    default_val_metric = "rouge2"

    def __init__(self, hparams, **kwargs):
        if hparams.sortish_sampler and hparams.gpus > 1:
            hparams.replace_sampler_ddp = False
        elif hparams.max_tokens_per_batch is not None:
            if hparams.gpus > 1:
                raise NotImplementedError("Dynamic Batch size does not work for multi-gpu training")
            if hparams.sortish_sampler:
                raise ValueError("--sortish_sampler and --max_tokens_per_batch may not be used simultaneously")

        super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
        #use_task_specific_params(self.model, "summarization")

        self.metrics_save_path = Path('base') / "metrics.json"
        self.hparams_save_path = Path('base') / "hparams.pkl"
        pickle_save(self.hparams, self.hparams_save_path)
        self.step_count = -2
        self.metrics = defaultdict(list)
        self.model_type = self.config.model_type
        self.vocab_size = self.config.tgt_vocab_size if self.model_type == "fsmt" else self.config.vocab_size

        if 't5' in hparams.model_name_or_path:
            self.model.config.prefix = 'translate Graph to English: '
        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.data_dir,
            max_source_length=self.hparams.max_source_length,
            prefix=self.model.config.prefix or "",
        )
        n_observations_per_split = {
            "train": self.hparams.n_train,
            "val": self.hparams.n_val,
            "test_seen": self.hparams.n_test,
            "test_unseen": self.hparams.n_test,
            "test_both": self.hparams.n_test,
        }
        self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

        self.target_lens = {
            "train": self.hparams.max_target_length,
            "val": self.hparams.val_max_target_length,
            "test_seen": self.hparams.test_max_target_length,
            "test_unseen": self.hparams.test_max_target_length,
            "test_both": self.hparams.test_max_target_length,
        }
        assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
        assert self.target_lens["train"] <= self.target_lens["test_both"], f"target_lens: {self.target_lens}"
        if self.hparams.freeze_embeds:
            freeze_embeds(self.model)
        if self.hparams.freeze_encoder:
            freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())

        self.num_workers = hparams.num_workers
        self.decoder_start_token_id = None # default to config
        if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
            self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
            self.model.config.decoder_start_token_id = self.decoder_start_token_id
        self.dataset_class = (
            Seq2SeqDataset if hasattr(self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset
        )
        self.already_saved_batch = False
        self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams
        if self.hparams.eval_max_gen_length is not None:
            self.eval_max_length = self.hparams.eval_max_gen_length
        else:
            self.eval_max_length = self.model.config.max_length
        self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric

    def save_readable_batch(self, batch: Dict[str, torch.Tensor]) -> Dict[str, List[str]]:
        """A debugging utility"""

        readable_batch = {
            k: self.tokenizer.batch_decode(v.tolist()) if "mask" not in k else v.shape for k, v in batch.items()
        }
        save_json(readable_batch, Path(self.output_dir) / "text_batch.json")

        tb = {}
        for k, v in batch.items():
            tb[k] = v.tolist()

        save_json(tb, Path(self.output_dir) / "tok_batch.json")

        self.already_saved_batch = True
        return readable_batch

    def forward(self, input_ids, **kwargs):
        return self.model(input_ids, **kwargs)

    def ids_to_clean_text(self, generated_ids: List[int]):
        gen_text = self.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        return lmap(str.strip, gen_text)

    def _step(self, batch: dict) -> Tuple:
        pad_token_id = self.tokenizer.pad_token_id
        src_ids, src_mask = batch["input_ids"], batch["attention_mask"]
        if isinstance(self.model, T5ForConditionalGeneration):
            tgt_ids = batch["labels"]
            decoder_input_ids = self.model._shift_right(tgt_ids)
        else:
            #decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)
            y = batch["labels"]
            decoder_input_ids = y[:, :-1].contiguous()
            tgt_ids = y[:, 1:].clone()
        if not self.already_saved_batch: # This would be slightly better if it only happened on rank zero
            batch["decoder_input_ids"] = decoder_input_ids
            self.save_readable_batch(batch)

        outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
        lm_logits = outputs[0]
        if self.hparams.label_smoothing == 0:
            # Same behavior as modeling_bart.py, besides ignoring pad_token_id
            ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)

            assert lm_logits.shape[-1] == self.vocab_size
            loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))
        else:
            lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1)
            loss, nll_loss = label_smoothed_nll_loss(
                lprobs, tgt_ids, self.hparams.label_smoothing, ignore_index=pad_token_id
            )
        return (loss,)

    @property
    def pad(self) -> int:
        return self.tokenizer.pad_token_id

    def training_step(self, batch, batch_idx) -> Dict:
        loss_tensors = self._step(batch)

        logs = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
        # tokens per batch
        logs["tpb"] = batch["input_ids"].ne(self.pad).sum() + batch["labels"].ne(self.pad).sum()
        logs["bs"] = batch["input_ids"].shape[0]
        logs["src_pad_tok"] = batch["input_ids"].eq(self.pad).sum()
        logs["src_pad_frac"] = batch["input_ids"].eq(self.pad).float().mean()
        # TODO(SS): make a wandb summary metric for this
        return {"loss": loss_tensors[0], "log": logs}

    def validation_step(self, batch, batch_idx) -> Dict:
        return self._generative_step(batch)

    def validation_epoch_end(self, outputs, prefix="val") -> Dict:

        self.step_count += 1

        val_outputs_folder = "val_outputs"
        os.system("mkdir -p " + os.path.join(self.hparams.output_dir, val_outputs_folder))

        if prefix == "val":
            output_test_predictions_file = os.path.join(self.hparams.output_dir, val_outputs_folder, "validation_predictions_" +
                                                        str(self.step_count) + ".txt")
            output_test_targets_file = os.path.join(self.hparams.output_dir, val_outputs_folder, "validation_targets_" +
                                                    str(self.step_count) + ".txt")
            # write predictions and targets for later rouge evaluation.
            with open(output_test_predictions_file, "w") as p_writer, open(output_test_targets_file, "w") as t_writer:
                for output_batch in outputs:
                    p_writer.writelines(convert_text(s) + "\n" for s in output_batch["preds"])
                    t_writer.writelines(convert_text(s) + "\n" for s in output_batch["target"])
                p_writer.close()
                t_writer.close()

            bleu_info = eval_bleu(self.hparams.data_dir, output_test_predictions_file, 'val')

            rank_zero_info("%s bleu_info: %s", self.step_count, bleu_info)

            if bleu_info == -1:
                bleu_info = float(bleu_info)
            else:
                bleu_info = float(bleu_info.split(",")[0].split("BLEU = ")[1])

            losses = {k: torch.stack([x[k] for x in outputs]).mean() for k in self.loss_names}
            loss = losses["loss"]
            generative_metrics = {
                k: np.array([x[k] for x in outputs]).mean() for k in self.metric_names + ["gen_time", "gen_len"]
            }

            generative_metrics['bleu'] = bleu_info

            metric_val = (
                generative_metrics[self.val_metric] if self.val_metric in generative_metrics else losses[
                    self.val_metric]
            )
            metric_tensor: torch.FloatTensor = torch.tensor(metric_val).type_as(loss)
            generative_metrics.update({k: v.item() for k, v in losses.items()})
            losses.update(generative_metrics)
            all_metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()}
            all_metrics["step_count"] = self.step_count
            self.metrics[prefix].append(all_metrics) # callback writes this to self.metrics_save_path
            preds = flatten_list([x["preds"] for x in outputs])

            return {
                "bleu": bleu_info,
                "log": all_metrics,
                "preds": preds,
                f"{prefix}_loss": loss,
                f"{prefix}_{self.val_metric}": metric_tensor,
            }
        else:

            data_logs = {}
            for output in outputs:

                dataset_idx = output[0]['dataloader_idx']

                if dataset_idx == 0:
                    dataset_name = 'test_both'
                elif dataset_idx == 1:
                    dataset_name = 'test_seen'
                else:
                    dataset_name = 'test_unseen'

                if output[0]['bleu'] == -1:
                    bleu_info = float(output[0]['bleu'])
                else:
                    bleu_info = float(output[0]['bleu'].split(",")[0].split("BLEU = ")[1])


                losses = {k: torch.stack([x[k] for x in output]).mean() for k in self.loss_names}
                loss = losses["loss"]
                generative_metrics = {
                    k: np.array([x[k] for x in output]).mean() for k in self.metric_names + ["gen_time", "gen_len"]
                }

                generative_metrics['bleu'] = bleu_info

                metric_val = (
                    generative_metrics[self.val_metric] if self.val_metric in generative_metrics else losses[
                        self.val_metric]
                )
                metric_tensor: torch.FloatTensor = torch.tensor(metric_val).type_as(loss)
                generative_metrics.update({k: v.item() for k, v in losses.items()})
                losses.update(generative_metrics)
                all_metrics = {f"{prefix}_avg_{k}": x for k, x in losses.items()}
                all_metrics["step_count"] = self.step_count
                self.metrics[prefix].append(all_metrics) # callback writes this to self.metrics_save_path
                preds = flatten_list([x["preds"] for x in output])

                data_logs.update({
                    "log" + "_" + dataset_name: all_metrics,
                    "preds" + "_" + dataset_name: preds,
                    f"{prefix}_loss" + "_" + dataset_name: loss,
                    f"{prefix}_{self.val_metric}" + "_" + dataset_name: metric_tensor,
                })
            return data_logs


    #######



    def calc_generative_metrics(self, preds, target) -> Dict:
        return calculate_rouge(preds, target)

    def _generative_step(self, batch: dict, batch_idx=None, dataloader_idx=None) -> dict:
        t0 = time.time()

        # parser.add_argument('--eval_max_gen_length', type=int, default=None, help='never generate more than n tokens')
        generated_ids = self.model.generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            use_cache=True,
            decoder_start_token_id=self.decoder_start_token_id,
            num_beams=self.eval_beams,
            max_length=self.eval_max_length,
            length_penalty=1.0
        )
        gen_time = (time.time() - t0) / batch["input_ids"].shape[0]
        preds: List[str] = self.ids_to_clean_text(generated_ids)
        target: List[str] = self.ids_to_clean_text(batch["labels"])
        loss_tensors = self._step(batch)
        base_metrics = {name: loss for name, loss in zip(self.loss_names, loss_tensors)}
        rouge: Dict = self.calc_generative_metrics(preds, target)
        summ_len = np.mean(lmap(len, generated_ids))
        base_metrics.update(gen_time=gen_time, gen_len=summ_len, preds=preds, target=target, **rouge)

        if dataloader_idx is not None:
            base_metrics.update(batch_idx=batch_idx, dataloader_idx=dataloader_idx)
        return base_metrics

    def test_step(self, batch, batch_idx, dataloader_idx):
        return self._generative_step(batch, batch_idx, dataloader_idx)

    def test_epoch_end(self, outputs_all_testsets):

        val_outputs_folder = "val_outputs"
        os.system("mkdir -p " + os.path.join(self.hparams.output_dir, val_outputs_folder))

        for outputs in outputs_all_testsets:
            dataset_idx = outputs[0]['dataloader_idx']

            if dataset_idx == 0:
                file_name = "test_both_predictions.txt"
                file_name_tgt = "test_both_targets.txt"
                dataset_name = 'test_both'
            elif dataset_idx == 1:
                file_name = "test_seen_predictions.txt"
                file_name_tgt = "test_seen_targets.txt"
                dataset_name = 'test_seen'
            else:
                file_name = "test_unseen_predictions.txt"
                file_name_tgt = "test_unseen_targets.txt"
                dataset_name = 'test_unseen'

            file_name += '.debug'
            file_name_tgt += '.debug'

            output_test_predictions_file = os.path.join(self.hparams.output_dir, val_outputs_folder, file_name)
            output_test_targets_file = os.path.join(self.hparams.output_dir, val_outputs_folder, file_name_tgt)
            # write predictions and targets for later rouge evaluation.
            with open(output_test_predictions_file, "w") as p_writer, open(output_test_targets_file, "w") as t_writer:
                for output_batch in outputs:

                    p_writer.writelines(convert_text(s) + "\n" for s in output_batch["preds"])
                    t_writer.writelines(convert_text(s) + "\n" for s in output_batch["target"])
                p_writer.close()
                t_writer.close()

            bleu_info = eval_bleu(self.hparams.data_dir, output_test_predictions_file, dataset_name)
            meteor_info = eval_meteor_test_webnlg(self.hparams.data_dir, output_test_predictions_file, dataset_name)
            chrf_info = eval_chrf_test_webnlg(self.hparams.data_dir, output_test_predictions_file, dataset_name)

            rank_zero_info(" %s - bleu_info: %s", dataset_name, bleu_info)
            rank_zero_info(" %s - meteor_info: %s", dataset_name, meteor_info)
            rank_zero_info(" %s - chrf_info: %s", dataset_name, chrf_info)

            outputs[0]['bleu'] = bleu_info

        return self.validation_epoch_end(outputs_all_testsets, prefix="test")

    def get_dataset(self, type_path) -> Seq2SeqDataset:
        n_obs = self.n_obs[type_path]
        max_target_length = self.target_lens[type_path]
        dataset = self.dataset_class(
            self.tokenizer,
            type_path=type_path,
            n_obs=n_obs,
            max_target_length=max_target_length,
            **self.dataset_kwargs,
        )
        return dataset

    def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False) -> DataLoader:
        dataset = self.get_dataset(type_path)

        if self.hparams.sortish_sampler and type_path != "test":
            sampler = dataset.make_sortish_sampler(batch_size, distributed=self.hparams.gpus > 1)
            return DataLoader(
                dataset,
                batch_size=batch_size,
                collate_fn=dataset.collate_fn,
                shuffle=False,
                num_workers=self.num_workers,
                sampler=sampler,
            )

        elif self.hparams.max_tokens_per_batch is not None and type_path != "test":
            batch_sampler = dataset.make_dynamic_sampler(
                self.hparams.max_tokens_per_batch, distributed=self.hparams.gpus > 1
            )
            return DataLoader(
                dataset,
                batch_sampler=batch_sampler,
                collate_fn=dataset.collate_fn,
                # shuffle=False,
                num_workers=self.num_workers,
                # batch_size=None,
            )
        else:
            return DataLoader(
                dataset,
                batch_size=batch_size,
                collate_fn=dataset.collate_fn,
                shuffle=shuffle,
                num_workers=self.num_workers,
                sampler=None,
            )

    def train_dataloader(self) -> DataLoader:
        dataloader = self.get_dataloader("train", batch_size=self.hparams.train_batch_size, shuffle=True)
        return dataloader

    def val_dataloader(self) -> DataLoader:
        return self.get_dataloader("val", batch_size=self.hparams.eval_batch_size)

    def test_dataloader(self) -> List[DataLoader]:
        test_dataloader = self.get_dataloader("test_both", batch_size=self.hparams.eval_batch_size)
        test_seen_dataloader = self.get_dataloader("test_seen", batch_size=self.hparams.eval_batch_size)
        test_unseen_dataloader = self.get_dataloader("test_unseen", batch_size=self.hparams.eval_batch_size)

        return [test_dataloader, test_seen_dataloader, test_unseen_dataloader]

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        BaseTransformer.add_model_specific_args(parser, root_dir)
        add_generic_args(parser, root_dir)
        parser.add_argument(
            "--max_source_length",
            default=1024,
            type=int,
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument(
            "--max_target_length",
            default=56,
            type=int,
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument(
            "--val_max_target_length",
            default=142,  # these defaults are optimized for CNNDM. For xsum, see README.md.
            type=int,
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument(
            "--test_max_target_length",
            default=142,
            type=int,
            help="The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded.",
        )
        parser.add_argument("--freeze_encoder", action="store_true")
        parser.add_argument("--freeze_embeds", action="store_true")
        parser.add_argument("--sortish_sampler", action="store_true", default=False)
        parser.add_argument("--max_tokens_per_batch", type=int, default=None)
        parser.add_argument("--logger_name", type=str, choices=["default", "wandb", "wandb_shared"], default="default")
        parser.add_argument("--n_train", type=int, default=-1, required=False, help="# examples. -1 means use all.")
        parser.add_argument("--n_val", type=int, default=-1, required=False, help="# examples. -1 means use all.")
        parser.add_argument("--n_test", type=int, default=-1, required=False, help="# examples. -1 means use all.")
        parser.add_argument(
            "--task", type=str, default="summarization", required=False, help="# examples. -1 means use all."
        )
        parser.add_argument("--label_smoothing", type=float, default=0.0, required=False)
        parser.add_argument("--src_lang", type=str, default="", required=False)
        parser.add_argument("--tgt_lang", type=str, default="", required=False)
        parser.add_argument("--eval_beams", type=int, default=None, required=False)
        parser.add_argument("--checkpoint", type=str, default=None, required=False)
        parser.add_argument(
            "--val_metric", type=str, default=None, required=False, choices=["bleu", "rouge2", "loss", None]
        )
        parser.add_argument("--eval_max_gen_length", type=int, default=None, help="never generate more than n tokens")
        parser.add_argument("--save_top_k", type=int, default=1, required=False, help="How many checkpoints to save")
        parser.add_argument(
            "--early_stopping_patience",
            type=int,
            default=-1,
            required=False,
            help="-1 means never early stop. early_stopping_patience is measured in validation checks, not epochs. So val_check_interval will effect it.",
        )

        return parser


class TranslationModule(SummarizationModule):
    mode = "translation"
    loss_names = ["loss"]
    metric_names = ["bleu"]
    default_val_metric = "bleu"

    def __init__(self, hparams, **kwargs):
        super().__init__(hparams, **kwargs)
        self.dataset_kwargs["src_lang"] = hparams.src_lang
        self.dataset_kwargs["tgt_lang"] = hparams.tgt_lang

    def calc_generative_metrics(self, preds, target) -> dict:
        return calculate_bleu(preds, target)


class Graph2TextModule(SummarizationModule):
    mode = "graph2text"
    loss_names = ["loss"]
    metric_names = ["sacrebleu"]
    default_val_metric = "bleu"

    def __init__(self, hparams, **kwargs):
        if type(hparams) == dict:
            hparams = argparse.Namespace(**hparams)
        print(f'Graph2Text hparams are: {hparams}')
        super().__init__(hparams, **kwargs)

        self.hparams.update(vars(hparams))

        rank_zero_info("parameters %s", hparams)

    def calc_generative_metrics(self, preds, target) -> dict:
        return calculate_bleu(preds, target)


def main(args, model=None) -> SummarizationModule:
    Path(args.output_dir).mkdir(exist_ok=True)
    if len(os.listdir(args.output_dir)) > 3 and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if model is None:
        if "summarization" in args.task:
            model: SummarizationModule = SummarizationModule(args)
        elif "translation" in args.task:
            model: SummarizationModule = TranslationModule(args)
        else:
            model: SummarizationModule = Graph2TextModule(args)
    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        logger = WandbLogger(name=model.output_dir.name, project=project)

    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")

    if args.early_stopping_patience >= 0:
        es_callback = get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
    else:
        es_callback = False

    lower_is_better = args.val_metric == "loss"
    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(
            args.output_dir, model.val_metric, args.save_top_k, lower_is_better
        ),
        early_stopping_callback=es_callback,
        logger=logger,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")
    if not args.do_predict:
        return model

    model.hparams.test_checkpoint = ""
    if not args.checkpoint:
        checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "*.ckpt"), recursive=True)))
    else:
        checkpoints = [args.checkpoint]

    if checkpoints:
        model.hparams.test_checkpoint = checkpoints[-1]
        trainer.resume_from_checkpoint = checkpoints[-1]

    if args.do_predict and not args.do_train:

        checkpoint = checkpoints[-1]
        print(checkpoint)
        #trainer.test(ckpt_path=checkpoints[-1])
        trainer.test(model, ckpt_path=checkpoint)
        return model


    trainer.logger.log_hyperparams(model.hparams)

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())

    args = parser.parse_args()

    main(args)
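A sketch of how finetune.py can be driven programmatically, mirroring the __main__ block above with an explicit argument list; the data directory, output directory, and model name below are placeholders, not values taken from this repository:

# Illustrative only: same parser wiring as the __main__ block, with explicit args.
import os
import argparse
import pytorch_lightning as pl
from utils.finetune import SummarizationModule, main

parser = argparse.ArgumentParser()
parser = pl.Trainer.add_argparse_args(parser)
parser = SummarizationModule.add_model_specific_args(parser, os.getcwd())
args = parser.parse_args([
    "--data_dir", "data/webnlg",        # placeholder path
    "--output_dir", "outputs/run1",     # placeholder path
    "--model_name_or_path", "t5-base",  # placeholder model name
    "--task", "graph2text",
    "--do_train",
    "--early_stopping_patience", "5",
])
main(args)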
utils/lightning_base.py
ADDED
@@ -0,0 +1,418 @@
import argparse
import logging
import os
from pathlib import Path
from typing import Any, Dict
import sys
import pytorch_lightning as pl
from pytorch_lightning.utilities import rank_zero_info
from pytorch_lightning.callbacks import LearningRateMonitor

from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForPreTraining,
    AutoModelForQuestionAnswering,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    PretrainedConfig,
    PreTrainedTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_cosine_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup,
)

from tokenizers import AddedToken

logger = logging.getLogger(__name__)

MODEL_MODES = {
    "base": AutoModel,
    "sequence-classification": AutoModelForSequenceClassification,
    "question-answering": AutoModelForQuestionAnswering,
    "pretraining": AutoModelForPreTraining,
    "token-classification": AutoModelForTokenClassification,
    "language-modeling": AutoModelWithLMHead,
    "summarization": AutoModelForSeq2SeqLM,
    "translation": AutoModelForSeq2SeqLM,
    "graph2text": AutoModelForSeq2SeqLM,
}


# update this and the import above to support new schedulers from transformers.optimization
arg_to_scheduler = {
    "linear": get_linear_schedule_with_warmup,
    "cosine": get_cosine_schedule_with_warmup,
    "cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
    "polynomial": get_polynomial_decay_schedule_with_warmup,
    # '': get_constant_schedule, # not supported for now
    # '': get_constant_schedule_with_warmup, # not supported for now
}
arg_to_scheduler_choices = sorted(arg_to_scheduler.keys())
arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}"


class BaseTransformer(pl.LightningModule):
    def __init__(
        self,
        hparams: argparse.Namespace,
        num_labels=None,
        mode="base",
        config=None,
        tokenizer=None,
        model=None,
        **config_kwargs
    ):
        """Initialize a model, tokenizer and config."""
        super().__init__()
        # TODO: move to self.save_hyperparameters()
        # self.save_hyperparameters()
        # can also expand arguments into trainer signature for easier reading
        self.save_hyperparameters(hparams)
        self.step_count = -2
        self.output_dir = Path(self.hparams.output_dir)
        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        if config is None:
            self.config = AutoConfig.from_pretrained(
                self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
                **({"num_labels": num_labels} if num_labels is not None else {}),
                cache_dir=cache_dir,
                **config_kwargs,
            )
        else:
            self.config: PretrainedConfig = config

        extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
        for p in extra_model_params:
            if getattr(self.hparams, p, None):
                assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
                setattr(self.config, p, getattr(self.hparams, p))

        if tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
                cache_dir=cache_dir,
            )
            new_tokens = [
                '<H>','<R>','<T>'
            ]
            new_tokens_vocab = {}
            new_tokens_vocab['additional_special_tokens'] = []
            for idx, t in enumerate(new_tokens):
                new_tokens_vocab['additional_special_tokens'].append(t)
            num_added_toks = self.tokenizer.add_special_tokens(new_tokens_vocab)
            rank_zero_info('We have added %s tokens', num_added_toks)
        else:
            self.tokenizer: PreTrainedTokenizer = tokenizer
        self.model_type = MODEL_MODES[mode]
        if model is None:
            self.model = self.model_type.from_pretrained(
                self.hparams.model_name_or_path,
                from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
                config=self.config,
                cache_dir=cache_dir,
            )
            self.model.resize_token_embeddings(len(self.tokenizer))
        else:
            self.model = model

    def load_hf_checkpoint(self, *args, **kwargs):
        self.model = self.model_type.from_pretrained(*args, **kwargs)

    def get_lr_scheduler(self):
        get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
        scheduler = get_schedule_func(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return scheduler

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        if self.hparams.adafactor:
            optimizer = Adafactor(
                optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False
            )

        else:
            optimizer = AdamW(
                optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
            )
        self.opt = optimizer

        scheduler = self.get_lr_scheduler()

        return [optimizer], [scheduler]


    def test_step(self, batch, batch_nb):
        return self.validation_step(batch, batch_nb)

    def test_epoch_end(self, outputs):
        return self.validation_end(outputs)

    @property
    def total_steps(self) -> int:
        # print('self.hparams.gpus', self.hparams.gpus)
        # print('self.hparams.accumulate_grad_batches', self.hparams.accumulate_grad_batches)
        # print('self.train_loader.dataset', self.train_loader.dataset)
        # print('self.hparams.max_epochs', self.hparams.max_epochs)
        # print('self.hparams.train_batch_size', self.hparams.train_batch_size)
        # exit()
        """The number of total training steps that will be run. Used for lr scheduler purposes."""
        num_devices = max(1, self.hparams.gpus)  # TODO: consider num_tpu_cores
        effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
        dataset_size = len(self.train_loader.dataset)
        return (dataset_size / effective_batch_size) * self.hparams.max_epochs

    def setup(self, mode):
        #if mode == "fit":
        self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)

    def get_dataloader(self, type_path, batch_size, shuffle=False):
        raise NotImplementedError("You must implement this for your task")

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)

    def test_dataloader(self):
        return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)

    def _feature_file(self, mode):
        return os.path.join(
            self.hparams.data_dir,
            "cached_{}_{}_{}".format(
                mode,
                list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
                str(self.hparams.max_seq_length),
            ),
        )

    def get_progress_bar_dict(self):
        #metrics = self.trainer.callback_metrics
        #print(self.trainer.lr_logger.lrs)
        lrs = self.trainer.lr_logger.lrs['lr-AdamW/pg1'][-1]
        running_train_loss = self.trainer.running_loss.mean()
        avg_training_loss = running_train_loss.cpu().item() if running_train_loss is not None else float('NaN')
        tqdm_dict = {"loss": "{:.3f}".format(avg_training_loss), "lr": lrs}
        return tqdm_dict

    @pl.utilities.rank_zero_only
    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        save_path = self.output_dir.joinpath("best_tfmr")
        self.model.config.save_step = self.step_count
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)

    @staticmethod
    def add_model_specific_args(parser, root_dir):
        parser.add_argument(
            "--model_name_or_path",
            default=None,
            type=str,
            required=True,
            help="Path to pretrained model or model identifier from huggingface.co/models",
        )
        parser.add_argument(
            "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
        )
        parser.add_argument(
            "--tokenizer_name",
            default=None,
            type=str,
            help="Pretrained tokenizer name or path if not the same as model_name",
        )
        parser.add_argument(
            "--cache_dir",
            default="",
            type=str,
            help="Where do you want to store the pre-trained models downloaded from s3",
        )
        parser.add_argument(
            "--encoder_layerdrop",
            type=float,
            help="Encoder layer dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument(
            "--decoder_layerdrop",
            type=float,
            help="Decoder layer dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument(
            "--dropout",
            type=float,
            help="Dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument(
            "--attention_dropout",
            type=float,
            help="Attention dropout probability (Optional). Goes into model.config",
        )
        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument(
            "--lr_scheduler",
            default="linear",
            choices=arg_to_scheduler_choices,
            metavar=arg_to_scheduler_metavar,
            type=str,
            help="Learning rate scheduler",
        )
        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
        parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
        parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader")
        parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
        parser.add_argument("--train_batch_size", default=32, type=int)
        parser.add_argument("--eval_batch_size", default=32, type=int)
        parser.add_argument("--adafactor", action="store_true")


class LoggingCallback(pl.Callback):
    def on_batch_end(self, trainer, pl_module):
        lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
        lrs = {f"lr_group_{i}": lr for i, lr in enumerate(lr_scheduler.get_lr())}
        pl_module.logger.log_metrics(lrs)

    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        rank_zero_info("***** Validation results *****")
        metrics = trainer.callback_metrics
        rank_zero_info(trainer.logger)
        # Log results
        for key in sorted(metrics):
            if key not in ["log", "progress_bar"]:
                rank_zero_info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        rank_zero_info("***** Test results *****")
        metrics = trainer.callback_metrics
        # Log and save results to file
        output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    rank_zero_info("{} = {}\n".format(key, str(metrics[key])))
                    writer.write("{} = {}\n".format(key, str(metrics[key])))


def add_generic_args(parser, root_dir) -> None:
    # TODO(SS): allow all pl args? parser = pl.Trainer.add_argparse_args(parser)
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )

    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O2",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int)
    parser.add_argument("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        dest="accumulate_grad_batches",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )


def generic_train(
    model: BaseTransformer,
    args: argparse.Namespace,
    early_stopping_callback=False,
    logger=True,  # can pass WandbLogger() here
    extra_callbacks=[],
    checkpoint_callback=None,
    logging_callback=None,
**extra_train_kwargs
|
| 372 |
+
):
|
| 373 |
+
pl.seed_everything(args.seed)
|
| 374 |
+
|
| 375 |
+
# init model
|
| 376 |
+
odir = Path(model.hparams.output_dir)
|
| 377 |
+
odir.mkdir(exist_ok=True)
|
| 378 |
+
|
| 379 |
+
# add custom checkpoints
|
| 380 |
+
if checkpoint_callback is None:
|
| 381 |
+
checkpoint_callback = pl.callbacks.ModelCheckpoint(
|
| 382 |
+
filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
|
| 383 |
+
)
|
| 384 |
+
if logging_callback is None:
|
| 385 |
+
logging_callback = LoggingCallback()
|
| 386 |
+
|
| 387 |
+
train_params = {}
|
| 388 |
+
|
| 389 |
+
# TODO: remove with PyTorch 1.6 since pl uses native amp
|
| 390 |
+
if args.fp16:
|
| 391 |
+
train_params["precision"] = 16
|
| 392 |
+
train_params["amp_level"] = args.fp16_opt_level
|
| 393 |
+
|
| 394 |
+
if args.gpus > 1:
|
| 395 |
+
train_params["distributed_backend"] = "ddp"
|
| 396 |
+
|
| 397 |
+
train_params["accumulate_grad_batches"] = args.accumulate_grad_batches
|
| 398 |
+
|
| 399 |
+
lr_logger = LearningRateMonitor(logging_interval='step')
|
| 400 |
+
|
| 401 |
+
# deterministic=True,
|
| 402 |
+
trainer = pl.Trainer.from_argparse_args(
|
| 403 |
+
args,
|
| 404 |
+
weights_summary='full',
|
| 405 |
+
callbacks=[logging_callback, lr_logger],
|
| 406 |
+
logger=logger,
|
| 407 |
+
checkpoint_callback=checkpoint_callback,
|
| 408 |
+
early_stop_callback=early_stopping_callback,
|
| 409 |
+
num_sanity_val_steps=4,
|
| 410 |
+
**train_params,
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
+
trainer.lr_logger = lr_logger
|
| 414 |
+
|
| 415 |
+
if args.do_train:
|
| 416 |
+
trainer.fit(model)
|
| 417 |
+
|
| 418 |
+
return trainer
|
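A minimal sketch (not part of the uploaded diff) of how the helpers above are typically strung together in a training entry point. It assumes a BaseTransformer subclass such as the Graph2TextModule defined in utils/finetune.py, and the command-line values shown in the comment are placeholders.

# sketch: wiring add_generic_args / add_model_specific_args / generic_train together
import argparse, os
from utils.lightning_base import add_generic_args, generic_train
from utils.finetune import Graph2TextModule

def sketch_train():
    parser = argparse.ArgumentParser()
    add_generic_args(parser, os.getcwd())
    Graph2TextModule.add_model_specific_args(parser, os.getcwd())
    # e.g. --data_dir data/ --output_dir out/ --model_name_or_path t5-base --do_train
    args = parser.parse_args()
    model = Graph2TextModule(args)
    return generic_train(model, args)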
utils/sentence_retrieval_model.py
ADDED
|
@@ -0,0 +1,20 @@
import torch
import torch.nn as nn

from utils.bert_model import BertForSequenceEncoder

class sentence_retrieval_model(nn.Module):
    def __init__(self, args):
        super(sentence_retrieval_model, self).__init__()
        self.pred_model = BertForSequenceEncoder.from_pretrained(args['bert_pretrain'])
        self.bert_hidden_dim = args['bert_hidden_dim']
        self.dropout = nn.Dropout(args['dropout'])
        self.proj_match = nn.Linear(self.bert_hidden_dim, 1)


    def forward(self, inp_tensor, msk_tensor, seg_tensor):
        _, inputs = self.pred_model(inp_tensor, msk_tensor, seg_tensor)
        inputs = self.dropout(inputs)
        score = self.proj_match(inputs).squeeze(-1)
        score = torch.tanh(score)
        return score
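A short note (not part of the uploaded diff): the scorer above encodes each claim-evidence pair with BertForSequenceEncoder and projects the pooled representation to a single tanh-squashed relevance score. The dict below only illustrates the expected keys; its values and the checkpoint path are assumptions.

# sketch: constructing the scorer directly (normally done by SentenceRetrievalModule below)
args = {'bert_pretrain': 'base/bert_base', 'bert_hidden_dim': 768, 'dropout': 0.6}
scorer = sentence_retrieval_model(args)
# scores = scorer(input_ids, attention_mask, token_type_ids)  # shape (batch,), values in [-1, 1]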
utils/sentence_retrieval_module.py
ADDED
|
@@ -0,0 +1,77 @@
import re
from typing import List, Tuple
import pathlib

import torch
from transformers import BertTokenizer

from utils.sentence_retrieval_model import sentence_retrieval_model


THIS_DIR = pathlib.Path(__file__).parent.absolute()
ARGS = {
    'batch_size': 32,
    'bert_pretrain': 'base/bert_base',
    'checkpoint': 'base/model.best.32.pt',
    'dropout': 0.6,
    'bert_hidden_dim': 768,
    'max_len': 384,
    'cuda': torch.cuda.is_available()
}

if not ARGS['cuda']:
    print('CUDA NOT AVAILABLE')


def process_sent(sentence):
    sentence = re.sub("LSB.*?RSB", "", sentence)
    sentence = re.sub("LRB\s*?RRB", "", sentence)
    sentence = re.sub("(\s*?)LRB((\s*?))", "\\1(\\2", sentence)
    sentence = re.sub("(\s*?)RRB((\s*?))", "\\1)\\2", sentence)
    sentence = re.sub("--", "-", sentence)
    sentence = re.sub("``", '"', sentence)
    sentence = re.sub("''", '"', sentence)
    return sentence

class SentenceRetrievalModule():

    def __init__(self, max_len=None):

        if max_len:
            ARGS['max_len'] = max_len

        self.tokenizer = BertTokenizer.from_pretrained(ARGS['bert_pretrain'], do_lower_case=False)
        self.model = sentence_retrieval_model(ARGS)
        self.model.load_state_dict(torch.load(ARGS['checkpoint'], map_location=torch.device('cpu'))['model'])
        if ARGS['cuda']:
            self.model = self.model.cuda()

    def score_sentence_pairs(self, inputs: List[Tuple[str]]):
        inputs_processed = [(process_sent(input[0]), process_sent(input[1])) for input in inputs]

        encodings = self.tokenizer(
            inputs_processed,
            padding='max_length',
            truncation='longest_first',
            max_length=ARGS['max_len'],
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        inp = encodings['input_ids']
        msk = encodings['attention_mask']
        seg = encodings['token_type_ids']

        if ARGS['cuda']:
            inp = inp.cuda()
            msk = msk.cuda()
            seg = seg.cuda()

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(inp, msk, seg).tolist()

        assert len(outputs) == len(inputs)

        return outputs
|
utils/textual_entailment_module.py
ADDED
|
@@ -0,0 +1,94 @@
import json
import numpy as np
import pandas as pd
from pathlib import Path
import torch
import re

from transformers import BertTokenizer, BertForSequenceClassification

# Constants and paths
HOME = Path('/users/k2031554')
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
MAX_LEN = 512
CLASSES = ['SUPPORTS','REFUTES','NOT ENOUGH INFO']
METHODS = ['WEIGHTED_SUM', 'MALON']

def process_sent(sentence):
    sentence = re.sub("LSB.*?RSB", "", sentence)
    sentence = re.sub("LRB\s*?RRB", "", sentence)
    sentence = re.sub("(\s*?)LRB((\s*?))", "\\1(\\2", sentence)
    sentence = re.sub("(\s*?)RRB((\s*?))", "\\1)\\2", sentence)
    sentence = re.sub("--", "-", sentence)
    sentence = re.sub("``", '"', sentence)
    sentence = re.sub("''", '"', sentence)
    return sentence

class TextualEntailmentModule():

    def __init__(
        self,
        model_path = 'base/models/BERT_FEVER_v4_model_PBT',
        tokenizer_path = 'base/models/BERT_FEVER_v4_tok_PBT'
    ):
        self.tokenizer = BertTokenizer.from_pretrained(
            tokenizer_path
        )
        self.model = BertForSequenceClassification.from_pretrained(
            model_path
        )
        self.model.to(DEVICE)

    #def get_pair_scores(self, claim, evidence):
    #
    #    encodings = self.tokenizer(
    #        [claim, evidence],
    #        max_length= MAX_LEN,
    #        return_token_type_ids=False,
    #        padding='max_length',
    #        truncation=True,
    #        return_tensors='pt',
    #    ).to(DEVICE)
    #
    #    self.model.eval()
    #    with torch.no_grad():
    #        probs = self.model(
    #            input_ids=encodings['input_ids'],
    #            attention_mask=encodings['attention_mask']
    #        )
    #
    #    return torch.softmax(probs.logits,dim=1).cpu().numpy()

    def get_batch_scores(self, claims, evidence):

        inputs = list(zip(claims, evidence))

        encodings = self.tokenizer(
            inputs,
            max_length= MAX_LEN,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        ).to(DEVICE)

        self.model.eval()
        with torch.no_grad():
            probs = self.model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask']
            )

        return torch.softmax(probs.logits,dim=1).cpu().numpy()

    def get_label_from_scores(self, scores):
        return CLASSES[np.argmax(scores)]

    def get_label_malon(self, score_set):
        score_labels = [np.argmax(s) for s in score_set]
        if 1 not in score_labels and 0 not in score_labels:
            return CLASSES[2] #NOT ENOUGH INFO
        elif 0 in score_labels:
            return CLASSES[0] #SUPPORTS
        elif 1 in score_labels:
            return CLASSES[1] #REFUTES
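A short usage sketch (not part of the uploaded diff) for the entailment module above. It assumes the default local 'base/models' checkpoints are present; the claim and evidence strings are made-up.

# sketch: classify claim-evidence pairs into SUPPORTS / REFUTES / NOT ENOUGH INFO
te = TextualEntailmentModule()
claims = ["The Earth orbits the Sun."]
evidence = ["The Earth revolves around the Sun once every 365.25 days."]
scores = te.get_batch_scores(claims, evidence)   # shape (1, 3) softmax probabilities over CLASSES
label = te.get_label_from_scores(scores[0])      # e.g. 'SUPPORTS'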
utils/utils_graph2text.py
ADDED
|
@@ -0,0 +1,114 @@
import re
import os

def convert_text(text):
    #return text
    text = text.lower()
    text = ' '.join(re.split('(\W)', text))
    text = ' '.join(text.split())
    return text

def eval_meteor_test_webnlg(folder_data, pred_file, dataset):

    dir_path = os.path.dirname(os.path.realpath(__file__))
    folder_data_before = dir_path + "/../utils"

    cmd_string = "java -jar " + folder_data_before + "/meteor-1.5.jar " + pred_file + " " \
                 + folder_data + "/" + dataset + ".target_eval_meteor -l en -norm -r 3 > " + pred_file.replace("txt", "meteor")

    os.system(cmd_string)

    meteor_info = open(pred_file.replace("txt", "meteor"), 'r').readlines()[-1].strip()

    return meteor_info


def eval_chrf_test_webnlg(folder_data, pred_file, dataset):

    dir_path = os.path.dirname(os.path.realpath(__file__))
    folder_data_before = dir_path + "/../utils"

    cmd_string = "python " + folder_data_before + "/chrf++.py -H " + pred_file + " -R " \
                 + folder_data + "/" + dataset + ".target_eval_crf > " + pred_file.replace("txt", "chrf")

    os.system(cmd_string)

    chrf_info_1 = open(pred_file.replace("txt", "chrf"), 'r').readlines()[1].strip()
    chrf_info_2 = open(pred_file.replace("txt", "chrf"), 'r').readlines()[2].strip()

    return chrf_info_1 + " " + chrf_info_2

def eval_bleu(folder_data, pred_file, dataset):

    dir_path = os.path.dirname(os.path.realpath(__file__))
    folder_data_before = dir_path + "/data/"

    cmd_string = "perl " + folder_data_before + "/multi-bleu.perl -lc " + folder_data + "/" + dataset + ".target_eval " \
                 + folder_data + "/" + dataset + ".target2_eval " + folder_data + "/" + dataset + ".target3_eval < " \
                 + pred_file + " > " + pred_file.replace("txt", "bleu")

    os.system(cmd_string)

    try:
        bleu_info = open(pred_file.replace("txt", "bleu"), 'r').readlines()[0].strip()
    except:
        bleu_info = -1

    return bleu_info


def eval_bleu_sents_tok(pred_file, folder_data, dataset):

    dir_path = os.path.dirname(os.path.realpath(__file__))
    folder_data_before = dir_path + "/../utils"

    cmd_string = "perl " + folder_data_before + "/tokenizer.perl -threads 4 -no-escape < " + pred_file + " > " +\
                 pred_file + "_tok"
    os.system(cmd_string)

    cmd_string = "perl " + folder_data_before + "/multi-bleu.perl -lc " + folder_data + "/" + dataset + ".target.tok"\
                 + " < " + pred_file + "_tok" + " > " + pred_file.replace("txt", "bleu_data")
    os.system(cmd_string)

    try:
        bleu_info_data = open(pred_file.replace("txt", "bleu_data"), 'r').readlines()[0].strip()
    except:
        bleu_info_data = 'no data'

    return bleu_info_data


def eval_meteor(ref_file, pred_file):

    dir_path = os.path.dirname(os.path.realpath(__file__))
    folder_data_before = dir_path + "/../utils"

    cmd_string = "java -jar " + folder_data_before + "/meteor-1.5.jar " + pred_file + " " \
                 + ref_file + " > " + pred_file.replace("txt", "meteor")

    os.system(cmd_string)

    meteor_info = open(pred_file.replace("txt", "meteor"), 'r').readlines()[-1].strip()

    return meteor_info


def eval_chrf(ref_file, pred_file):

    dir_path = os.path.dirname(os.path.realpath(__file__))
    folder_data_before = dir_path + "/../utils"

    cmd_string = "python " + folder_data_before + "/chrf++.py -H " + pred_file + " -R " \
                 + ref_file + " > " + pred_file.replace("txt", "chrf")

    os.system(cmd_string)

    try:
        chrf_info_1 = open(pred_file.replace("txt", "chrf"), 'r').readlines()[1].strip()
        chrf_info_2 = open(pred_file.replace("txt", "chrf"), 'r').readlines()[2].strip()
        chrf_data = chrf_info_1 + " " + chrf_info_2
    except:
        chrf_data = "no data"


    return chrf_data
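A short note and sketch (not part of the uploaded diff): the eval_* helpers above shell out to external scorers (meteor-1.5.jar, chrf++.py, multi-bleu.perl, tokenizer.perl) that are expected to sit next to this file; only convert_text is self-contained. The example string below is made-up.

# sketch: normalise a hypothesis before writing it to the prediction file read by the scorers
hyp = convert_text("Douglas Adams (1952-2001) was an English author.")
# -> "douglas adams ( 1952 - 2001 ) was an english author ."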
utils/utils_verbalisation_module.py
ADDED
|
@@ -0,0 +1,610 @@
import itertools
import json
import linecache
import math
import os
import pickle
import socket
from logging import getLogger
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Tuple, Union

import numpy as np
import torch
import torch.distributed as dist
from rouge_score import rouge_scorer, scoring
from sacrebleu import corpus_bleu
from torch import nn
from torch.utils.data import Dataset, Sampler

from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer
from transformers.file_utils import cached_property
from transformers.models.bart.modeling_bart import shift_tokens_right
from utils.utils_graph2text import convert_text, eval_bleu
from pytorch_lightning.utilities import rank_zero_info
import pdb


try:
    from fairseq.data.data_utils import batch_by_size

    FAIRSEQ_AVAILABLE = True
except (ImportError, ModuleNotFoundError):
    FAIRSEQ_AVAILABLE = False


def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):
    """From fairseq"""
    if target.dim() == lprobs.dim() - 1:
        target = target.unsqueeze(-1)
    nll_loss = -lprobs.gather(dim=-1, index=target)
    smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
    if ignore_index is not None:
        pad_mask = target.eq(ignore_index)
        nll_loss.masked_fill_(pad_mask, 0.0)
        smooth_loss.masked_fill_(pad_mask, 0.0)
    else:
        nll_loss = nll_loss.squeeze(-1)
        smooth_loss = smooth_loss.squeeze(-1)

    nll_loss = nll_loss.sum()  # mean()? Scared to break other math.
    smooth_loss = smooth_loss.sum()
    eps_i = epsilon / lprobs.size(-1)
    loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
    return loss, nll_loss


def lmap(f: Callable, x: Iterable) -> List:
    """list(map(f, x))"""
    return list(map(f, x))


def calculate_bleu(output_lns, refs_lns) -> dict:
    """Uses sacrebleu's corpus_bleu implementation."""
    return {"sacrebleu": round(corpus_bleu(output_lns, [refs_lns]).score, 4)}


def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], Dict]:
    def non_pad_len(tokens: np.ndarray) -> int:
        return np.count_nonzero(tokens != tokenizer.pad_token_id)

    def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
        pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
        label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
        pred_str = lmap(str.strip, pred_str)
        label_str = lmap(str.strip, label_str)
        return pred_str, label_str

    def summarization_metrics(pred: EvalPrediction) -> Dict:
        pred_str, label_str = decode_pred(pred)
        rouge: Dict = calculate_rouge(pred_str, label_str)
        summ_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
        rouge.update({"gen_len": summ_len})
        return rouge

    def translation_metrics(pred: EvalPrediction) -> Dict:
        pred_str, label_str = decode_pred(pred)
        bleu: Dict = calculate_bleu(pred_str, label_str)
        gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
        bleu.update({"gen_len": gen_len})
        return bleu

    compute_metrics_fn = summarization_metrics if "summarization" in task_name else translation_metrics
    return compute_metrics_fn


def trim_batch(
    input_ids,
    pad_token_id,
    attention_mask=None,
):
    """Remove columns that are populated exclusively by pad_token_id"""
    keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
    if attention_mask is None:
        return input_ids[:, keep_column_mask]
    else:
        return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask])


class AbstractSeq2SeqDataset(Dataset):
    def __init__(
        self,
        tokenizer,
        data_dir,
        max_source_length,
        max_target_length,
        type_path="train",
        n_obs=None,
        prefix="",
        **dataset_kwargs
    ):
        super().__init__()
        self.src_file = Path(data_dir).joinpath(type_path + ".source")
        self.tgt_file = Path(data_dir).joinpath(type_path + ".target")
        self.len_file = Path(data_dir).joinpath(type_path + ".len")
        if os.path.exists(self.len_file):
            self.src_lens = pickle_load(self.len_file)
            self.used_char_len = False
        else:
            self.src_lens = self.get_char_lens(self.src_file)
            self.used_char_len = True
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        assert min(self.src_lens) > 0, f"found empty line in {self.src_file}"
        self.tokenizer = tokenizer
        self.prefix = prefix if prefix is not None else ""

        if n_obs is not None:
            self.src_lens = self.src_lens[:n_obs]
        self.pad_token_id = self.tokenizer.pad_token_id
        self.dataset_kwargs = dataset_kwargs
        dataset_kwargs.update({"add_prefix_space": True} if isinstance(self.tokenizer, BartTokenizer) else {})

    def __len__(self):
        return len(self.src_lens)

    @staticmethod
    def get_char_lens(data_file):
        return [len(x) for x in Path(data_file).open().readlines()]

    @cached_property
    def tgt_lens(self):
        """Length in characters of target documents"""
        return self.get_char_lens(self.tgt_file)

    def make_sortish_sampler(self, batch_size, distributed=False, shuffle=True, **kwargs):
        if distributed:
            return DistributedSortishSampler(self, batch_size, shuffle=shuffle, **kwargs)
        else:
            return SortishSampler(self.src_lens, batch_size, shuffle=shuffle)

    def make_dynamic_sampler(self, max_tokens_per_batch=1024, **kwargs):
        assert FAIRSEQ_AVAILABLE, "Dynamic batch size requires `pip install fairseq`"
        assert not self.used_char_len, "You must call python make_len_file.py before calling make_dynamic_sampler"
        sorted_indices = list(self.make_sortish_sampler(1024, shuffle=False))

        def num_tokens_in_example(i):
            return min(self.src_lens[i], self.max_target_length)

        # call fairseq cython function
        batch_sampler: List[List[int]] = batch_by_size(
            sorted_indices,
            num_tokens_fn=num_tokens_in_example,
            max_tokens=max_tokens_per_batch,
            required_batch_size_multiple=64,
        )
        shuffled_batches = [batch_sampler[i] for i in np.random.permutation(range(len(batch_sampler)))]
        # move the largest batch to the front to OOM quickly (uses an approximation for padding)
        approximate_toks_per_batch = [max(self.src_lens[i] for i in batch) * len(batch) for batch in shuffled_batches]
        largest_batch_idx = np.argmax(approximate_toks_per_batch)
        shuffled_batches[0], shuffled_batches[largest_batch_idx] = (
            shuffled_batches[largest_batch_idx],
            shuffled_batches[0],
        )
        return shuffled_batches

    def __getitem__(self, item):
        raise NotImplementedError("You must implement this")

    def collate_fn(self, batch):
        raise NotImplementedError("You must implement this")


class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
    def __getitem__(self, index) -> Dict[str, torch.Tensor]:
        """Call tokenizer on src and tgt_lines"""

        index = index + 1  # linecache starts at 1
        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
        tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
        assert source_line, f"empty source line for index {index}"
        assert tgt_line, f"empty tgt line for index {index}"
        source_inputs = self.encode_line(self.tokenizer, source_line, self.max_source_length)
        target_inputs = self.encode_line(self.tokenizer, tgt_line, self.max_target_length)

        source_ids = source_inputs["input_ids"].squeeze()
        target_ids = target_inputs["input_ids"].squeeze()
        src_mask = source_inputs["attention_mask"].squeeze()
        return {
            "input_ids": source_ids,
            "attention_mask": src_mask,
            "labels": target_ids,
        }

    def encode_line(self, tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"):
        """Only used by LegacyDataset"""
        return tokenizer(
            [line],
            max_length=max_length,
            padding="max_length" if pad_to_max_length else None,
            truncation=True,
            return_tensors=return_tensors,
            **self.dataset_kwargs,
        )

    def collate_fn(self, batch) -> Dict[str, torch.Tensor]:
        input_ids = torch.stack([x["input_ids"] for x in batch])
        masks = torch.stack([x["attention_mask"] for x in batch])
        target_ids = torch.stack([x["labels"] for x in batch])
        pad_token_id = self.pad_token_id
        y = trim_batch(target_ids, pad_token_id)
        source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks)
        batch = {
            "input_ids": source_ids,
            "attention_mask": source_mask,
            "labels": y,
        }
        return batch


class Seq2SeqDataset(AbstractSeq2SeqDataset):
    """A dataset that calls prepare_seq2seq_batch."""

    def __getitem__(self, index) -> Dict[str, str]:

        #print(self.dataset_kwargs['model_t'])
        # if 't5' in self.dataset_kwargs['model_t']:
        #     self.prefix = 'translate Graph to English: '
        #     print('aac')
        #     exit()

        index = index + 1  # linecache starts at 1
        source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
        tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
        assert source_line, f"empty source line for index {index}"
        assert tgt_line, f"empty tgt line for index {index}"
        return {"tgt_texts": tgt_line, "src_texts": source_line, "id": index - 1}

    def collate_fn(self, batch):
        """Call prepare_seq2seq_batch."""
        batch_encoding: Dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
            [x["src_texts"] for x in batch],
            tgt_texts=[x["tgt_texts"] for x in batch],
            max_length=self.max_source_length,
            max_target_length=self.max_target_length,
            return_tensors="pt",
            **self.dataset_kwargs,
        ).data
        #lens = (batch_encoding['attention_mask'] == 1.).sum(dim=1).tolist()

        batch_encoding["ids"] = torch.tensor([x["id"] for x in batch])

        return batch_encoding



class Seq2SeqDataCollator:
    def __init__(self, tokenizer, data_args, tpu_num_cores=None):
        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id
        assert (
            self.pad_token_id is not None
        ), f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
        self.data_args = data_args
        self.tpu_num_cores = tpu_num_cores
        self.dataset_kwargs = {"add_prefix_space": isinstance(tokenizer, BartTokenizer)}
        if data_args.src_lang is not None:
            self.dataset_kwargs["src_lang"] = data_args.src_lang
        if data_args.tgt_lang is not None:
            self.dataset_kwargs["tgt_lang"] = data_args.tgt_lang

    def __call__(self, batch) -> Dict[str, torch.Tensor]:
        if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
            batch = self._encode(batch)
            input_ids, attention_mask, labels = (
                batch["input_ids"],
                batch["attention_mask"],
                batch["labels"],
            )
        else:
            input_ids = torch.stack([x["input_ids"] for x in batch])
            attention_mask = torch.stack([x["attention_mask"] for x in batch])
            labels = torch.stack([x["labels"] for x in batch])

            labels = trim_batch(labels, self.pad_token_id)
            input_ids, attention_mask = trim_batch(input_ids, self.pad_token_id, attention_mask=attention_mask)

        if isinstance(self.tokenizer, T5Tokenizer):
            decoder_input_ids = self._shift_right_t5(labels)
        else:
            decoder_input_ids = shift_tokens_right(labels, self.pad_token_id)

        batch = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "decoder_input_ids": decoder_input_ids,
            "labels": labels,
        }
        return batch

    def _shift_right_t5(self, input_ids):
        # shift inputs to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = self.pad_token_id
        return shifted_input_ids

    def _encode(self, batch) -> Dict[str, torch.Tensor]:
        batch_encoding = self.tokenizer.prepare_seq2seq_batch(
            [x["src_texts"] for x in batch],
            tgt_texts=[x["tgt_texts"] for x in batch],
            max_length=self.data_args.max_source_length,
            max_target_length=self.data_args.max_target_length,
            padding="max_length" if self.tpu_num_cores is not None else "longest",  # TPU hack
            return_tensors="pt",
            **self.dataset_kwargs,
        )
        return batch_encoding.data


class SortishSampler(Sampler):
    "Go through the text data by order of src length with a bit of randomness. From fastai repo."

    def __init__(self, data, batch_size, shuffle=True):
        self.data, self.bs, self.shuffle = data, batch_size, shuffle

    def __len__(self) -> int:
        return len(self.data)

    def __iter__(self):
        return iter(sortish_sampler_indices(self.data, self.bs, shuffle=self.shuffle))


def sortish_sampler_indices(data: List, bs: int, shuffle=True) -> np.array:
    "Go through the text data by order of src length with a bit of randomness. From fastai repo."
    if not shuffle:
        return np.argsort(np.array(data) * -1)

    def key_fn(i):
        return data[i]

    idxs = np.random.permutation(len(data))
    sz = bs * 50
    ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)]
    sort_idx = np.concatenate([sorted(s, key=key_fn, reverse=True) for s in ck_idx])
    sz = bs
    ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)]
    max_ck = np.argmax([key_fn(ck[0]) for ck in ck_idx])  # find the chunk with the largest key,
    ck_idx[0], ck_idx[max_ck] = ck_idx[max_ck], ck_idx[0]  # then make sure it goes first.
    sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([], dtype=np.int)
    sort_idx = np.concatenate((ck_idx[0], sort_idx))
    return sort_idx


class DistributedSortishSampler(Sampler):
    """Copied from torch DistributedSampler"""

    def __init__(self, dataset, batch_size, num_replicas=None, rank=None, add_extra_examples=True, shuffle=True):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError("Requires distributed package to be available")
            rank = dist.get_rank()
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        if add_extra_examples:
            self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
            self.total_size = self.num_samples * self.num_replicas
        else:
            self.total_size = len(dataset)
            self.num_samples = len(self.available_indices)
        self.batch_size = batch_size
        self.add_extra_examples = add_extra_examples
        self.shuffle = shuffle

    def __iter__(self) -> Iterable:
        g = torch.Generator()
        g.manual_seed(self.epoch)

        sortish_data = [self.dataset.src_lens[i] for i in self.available_indices]
        sortish_indices = sortish_sampler_indices(sortish_data, self.batch_size, shuffle=self.shuffle)
        indices = [self.available_indices[i] for i in sortish_indices]
        assert len(indices) == self.num_samples
        return iter(indices)

    @cached_property
    def available_indices(self) -> np.array:
        indices = list(range(len(self.dataset)))
        # add extra samples to make it evenly divisible
        indices += indices[: (self.total_size - len(indices))]
        assert len(indices) == self.total_size
        # subsample
        available_indices = indices[self.rank : self.total_size : self.num_replicas]
        return available_indices

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch


logger = getLogger(__name__)


def use_task_specific_params(model, task):
    """Update config with summarization specific params."""
    task_specific_params = model.config.task_specific_params

    if task_specific_params is not None:
        pars = task_specific_params.get(task, {})
        logger.info(f"using task specific params for {task}: {pars}")
        model.config.update(pars)


def pickle_load(path):
    """pickle.load(path)"""
    with open(path, "rb") as f:
        return pickle.load(f)


def pickle_save(obj, path):
    """pickle.dump(obj, path)"""
    with open(path, "wb") as f:
        return pickle.dump(obj, f)


def flatten_list(summary_ids: List[List]):
    return [x for x in itertools.chain.from_iterable(summary_ids)]


def save_json(content, path, indent=4, **json_dump_kwargs):
    with open(path, "w") as f:
        json.dump(content, f, indent=indent, **json_dump_kwargs)


def load_json(path):
    with open(path) as f:
        return json.load(f)


ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"]


def extract_rouge_mid_statistics(dct):
    new_dict = {}
    for k1, v1 in dct.items():
        mid = v1.mid
        new_dict[k1] = {stat: round(getattr(mid, stat), 4) for stat in ["precision", "recall", "fmeasure"]}
    return new_dict


def calculate_rouge(
    pred_lns: List[str],
    tgt_lns: List[str],
    use_stemmer=True,
    rouge_keys=ROUGE_KEYS,
    return_precision_and_recall=False,
    bootstrap_aggregation=True,
    newline_sep=True,
) -> Dict:
    """Calculate rouge using rouge_scorer package.

    Args:
        pred_lns: list of summaries generated by model
        tgt_lns: list of groundtruth summaries (e.g. contents of val.target)
        use_stemmer: Bool indicating whether Porter stemmer should be used to
            strip word suffixes to improve matching.
        rouge_keys: which metrics to compute, defaults to rouge1, rouge2, rougeL, rougeLsum
        return_precision_and_recall: (False) whether to also return precision and recall.
        bootstrap_aggregation: whether to do the typical bootstrap resampling of scores. Defaults to True, if False
            this function returns a collections.defaultdict[metric: list of values for each observation for each subscore]``
        newline_sep:(default=True) whether to add newline between sentences. This is essential for calculation rougeL
            on multi sentence summaries (CNN/DM dataset).

    Returns:
        Dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys

    """
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
    aggregator = scoring.BootstrapAggregator()
    for pred, tgt in zip(tgt_lns, pred_lns):
        # rougeLsum expects "\n" separated sentences within a summary
        if newline_sep:
            pred = add_newline_to_end_of_each_sentence(pred)
            tgt = add_newline_to_end_of_each_sentence(tgt)
        scores = scorer.score(pred, tgt)
        aggregator.add_scores(scores)

    if bootstrap_aggregation:
        result = aggregator.aggregate()
        if return_precision_and_recall:
            return extract_rouge_mid_statistics(result)  # here we return dict
        else:
            return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}

    else:
        return aggregator._scores  # here we return defaultdict(list)


# Utilities for freezing parameters and checking whether they are frozen


def freeze_params(model: nn.Module):
    """Set requires_grad=False for each of model.parameters()"""
    for par in model.parameters():
        par.requires_grad = False


def freeze_embeds(model):
    """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
    model_type = model.config.model_type

    if model_type == "t5":
        freeze_params(model.shared)
        for d in [model.encoder, model.decoder]:
            freeze_params(d.embed_tokens)
    elif model_type == "fsmt":
        for d in [model.model.encoder, model.model.decoder]:
            freeze_params(d.embed_positions)
            freeze_params(d.embed_tokens)
    else:
        freeze_params(model.model.shared)
        for d in [model.model.encoder, model.model.decoder]:
            freeze_params(d.embed_positions)
            freeze_params(d.embed_tokens)


def grad_status(model: nn.Module) -> Iterable:
    return (par.requires_grad for par in model.parameters())


def any_requires_grad(model: nn.Module) -> bool:
    return any(grad_status(model))


def assert_all_frozen(model):
    model_grads: List[bool] = list(grad_status(model))
    n_require_grad = sum(lmap(int, model_grads))
    npars = len(model_grads)
    assert not any(model_grads), f"{n_require_grad/npars:.1%} of {npars} weights require grad"


def assert_not_all_frozen(model):
    model_grads: List[bool] = list(grad_status(model))
    npars = len(model_grads)
    assert any(model_grads), f"none of {npars} weights require grad"


def parse_numeric_n_bool_cl_kwargs(unparsed_args: List[str]) -> Dict[str, Union[int, float, bool]]:
    """
    Parse an argv list of unspecified command line args to a dict.
    Assumes all values are either numeric or boolean in the form of true/false.
    """
    result = {}
    assert len(unparsed_args) % 2 == 0, f"got odd number of unparsed args: {unparsed_args}"
    num_pairs = len(unparsed_args) // 2
    for pair_num in range(num_pairs):
        i = 2 * pair_num
        assert unparsed_args[i].startswith("--")
        if unparsed_args[i + 1].lower() == "true":
            value = True
        elif unparsed_args[i + 1].lower() == "false":
            value = False
        else:
            try:
                value = int(unparsed_args[i + 1])
            except ValueError:
                value = float(unparsed_args[i + 1])  # this can raise another informative ValueError

        result[unparsed_args[i][2:]] = value
    return result


def write_txt_file(ordered_tgt, path):
    f = Path(path).open("w")
    for ln in ordered_tgt:
        f.write(ln + "\n")
    f.flush()


def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i : i + n]
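A short usage sketch (not part of the uploaded diff) for two of the metric helpers above; the strings are made-up. newline_sep=False is passed here because add_newline_to_end_of_each_sentence is not imported in this file, so the default newline_sep=True path would fail.

# sketch: corpus-level generation metrics
preds = ["Berlin is the capital of Germany."]
refs = ["Berlin is the capital and largest city of Germany."]
print(calculate_bleu(preds, refs))                      # {'sacrebleu': ...}
print(calculate_rouge(preds, refs, newline_sep=False))  # {'rouge1': ..., 'rouge2': ..., ...}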
utils/verbalisation_module.py
ADDED
|
@@ -0,0 +1,300 @@
from utils.finetune import Graph2TextModule
from typing import Dict, List, Tuple, Union, Optional
import torch
import re

if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
    print('CUDA NOT AVAILABLE')

CHECKPOINT = 'base/t5-base_13881_val_avg_bleu=68.1000-step_count=5.ckpt'
MAX_LENGTH = 384
SEED = 42


class VerbModule():

    def __init__(self, override_args: Dict[str, str] = None):
        # Model
        if not override_args:
            override_args = {}
        self.g2t_module = Graph2TextModule.load_from_checkpoint(CHECKPOINT, strict=False, **override_args)
        self.tokenizer = self.g2t_module.tokenizer
        # Unk replacer
        self.vocab = self.tokenizer.get_vocab()
        self.convert_some_japanese_characters = True
        self.unk_char_replace_sliding_window_size = 2
        self.unknowns = []

    def __generate_verbalisations_from_inputs(self, inputs: Union[str, List[str]]):
        try:
            inputs_encoding = self.tokenizer.prepare_seq2seq_batch(
                inputs, truncation=True, max_length=MAX_LENGTH, return_tensors='pt'
            )
            inputs_encoding = {k: v.to(DEVICE) for k, v in inputs_encoding.items()}

            self.g2t_module.model.eval()
            with torch.no_grad():
                gen_output = self.g2t_module.model.generate(
                    inputs_encoding['input_ids'],
                    attention_mask=inputs_encoding['attention_mask'],
                    use_cache=True,
                    decoder_start_token_id = self.g2t_module.decoder_start_token_id,
                    num_beams= self.g2t_module.eval_beams,
                    max_length= self.g2t_module.eval_max_length,
                    length_penalty=1.0
                )
        except Exception:
            print(inputs)
            raise

        return gen_output

    '''
    We create this function as an alteration from [this one](https://github.com/huggingface/transformers/blob/198c335d219a5eb4d3f124fdd1ce1a9cd9f78a9b/src/transformers/tokenization_utils_fast.py#L537), mainly because the official 'tokenizer.decode' treats all special tokens the same, while we want to drop all special tokens from the decoded sentence EXCEPT for the <unk> token, which we will replace later on.
    '''
    def __decode_ids_to_string_custom(
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
    ) -> str:
        filtered_tokens = self.tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
        # Do not remove special tokens yet

        # To avoid mixing byte-level and unicode for byte-level BPT
        # we need to build string separatly for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and\
                token != self.tokenizer.unk_token and\
                token in self.tokenizer.all_special_tokens:

                continue
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.tokenizer.convert_tokens_to_string(current_sub_text))
        text = " ".join(sub_texts)

        if clean_up_tokenization_spaces:
            clean_text = self.tokenizer.clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def __decode_sentences(self, encoded_sentences: Union[str, List[str]]):
        if type(encoded_sentences) == str:
            encoded_sentences = [encoded_sentences]
        decoded_sentences = [self.__decode_ids_to_string_custom(i, skip_special_tokens=True) for i in encoded_sentences]
        return decoded_sentences

    def verbalise_sentence(self, inputs: Union[str, List[str]]):
        if type(inputs) == str:
            inputs = [inputs]

        gen_output = self.__generate_verbalisations_from_inputs(inputs)

        decoded_sentences = self.__decode_sentences(gen_output)

        if len(decoded_sentences) == 1:
            return decoded_sentences[0]
        else:
            return decoded_sentences

    def verbalise_triples(self, input_triples: Union[Dict[str, str], List[Dict[str, str]], List[List[Dict[str, str]]]]):
        if type(input_triples) == dict:
            input_triples = [input_triples]

        verbalisation_inputs = []
        for triple in input_triples:
            if type(triple) == dict:
                assert 'subject' in triple
                assert 'predicate' in triple
                assert 'object' in triple
                verbalisation_inputs.append(
                    f'translate Graph to English: <H> {triple["subject"]} <R> {triple["predicate"]} <T> {triple["object"]}'
                )
            elif type(triple) == list:
                input_sentence = ['translate Graph to English:']
                for subtriple in triple:
                    assert 'subject' in subtriple
                    assert 'predicate' in subtriple
                    assert 'object' in subtriple
                    input_sentence.append(f'<H> {subtriple["subject"]}')
                    input_sentence.append(f'<R> {subtriple["predicate"]}')
                    input_sentence.append(f'<T> {subtriple["object"]}')
                verbalisation_inputs.append(
                    ' '.join(input_sentence)
                )

        return self.verbalise_sentence(verbalisation_inputs)

    def verbalise(self, input: Union[str, List, Dict]):
        try:
            if (type(input) == str) or (type(input) == list and type(input[0]) == str):
                return self.verbalise_sentence(input)
            elif (type(input) == dict) or (type(input) == list and type(input[0]) == dict):
                return self.verbalise_triples(input)
            else:
                return self.verbalise_triples(input)
        except Exception:
            print(f'ERROR VERBALISING {input}')
            raise

    def add_label_to_unk_replacer(self, label: str):
        N = self.unk_char_replace_sliding_window_size
        self.unknowns.append({})

        # Some pre-processing of labels to normalise some characters
        if self.convert_some_japanese_characters:
            label = label.replace('(','(')
            label = label.replace(')',')')
            label = label.replace('〈','<')
            label = label.replace('/','/')
            label = label.replace('〉','>')

        label_encoded = self.tokenizer.encode(label)
        label_tokens = self.tokenizer.convert_ids_to_tokens(label_encoded)

        # Here, we also remove </s> (eos) and <pad> tokens in the replacing key, because:
        # 1) When the whole label is all unk:
        # label_token_to_string would be '<unk></s>', meaning the replacing key (which is the same) only replaces
        # the <unk> if it appears at the end of the sentence, which is not the desired effect.
        # But since this means ANY <unk> will be replaced by this, it would be good to only replace keys that are <unk>
        # on the last replacing pass.
        # 2) On other cases, then the unk is in the label but not in its entirety, like in the start/end, it might
        # involve the starting <pad> token or the ending <eos> token on the replacing key, again forcing the replacement
        # to only happen if the label appears in the end of the sentence.
        label_tokens = [t for t in label_tokens if t not in [
            self.tokenizer.eos_token, self.tokenizer.pad_token
        ]]

        label_token_to_string = self.tokenizer.convert_tokens_to_string(label_tokens)
        unk_token_to_string = self.tokenizer.convert_tokens_to_string([self.tokenizer.unk_token])

        #print(label_encoded,label_tokens,label_token_to_string)

        match_unks_in_label = re.findall('(?:(?: )*<unk>(?: )*)+', label_token_to_string)
        if len(match_unks_in_label) > 0:
            # If the whole label is made of UNK
            if (match_unks_in_label[0]) == label_token_to_string:
                #print('Label is all unks')
                self.unknowns[-1][label_token_to_string.strip()] = label
            # Else, there should be non-UNK characters in the label
            else:
                #print('Label is NOT all unks')
                # Analyse the label with a sliding window of size N (N before, N ahead)
                for idx, token in enumerate(label_tokens):
                    idx_before = max(0,idx-N)
                    idx_ahead = min(len(label_tokens), idx+N+1)


                    # Found a UNK
                    if token == self.tokenizer.unk_token:
|
| 196 |
+
|
| 197 |
+
# In case multiple UNK, exclude UNKs seen after this one, expand window to other side if possible
|
| 198 |
+
if len(match_unks_in_label) > 1:
|
| 199 |
+
#print(idx)
|
| 200 |
+
#print(label_tokens)
|
| 201 |
+
#print(label_tokens[idx_before:idx_ahead])
|
| 202 |
+
#print('HERE!')
|
| 203 |
+
# Reduce on the right, expanding on the left
|
| 204 |
+
while self.tokenizer.unk_token in label_tokens[idx+1:idx_ahead]:
|
| 205 |
+
idx_before = max(0,idx_before-1)
|
| 206 |
+
idx_ahead = min(idx+2, idx_ahead-1)
|
| 207 |
+
#print(label_tokens[idx_before:idx_ahead])
|
| 208 |
+
# Now just reduce on the left
|
| 209 |
+
while self.tokenizer.unk_token in label_tokens[idx_before:idx]:
|
| 210 |
+
idx_before = min(idx-1,idx_before+2)
|
| 211 |
+
#print(label_tokens[idx_before:idx_ahead])
|
| 212 |
+
|
| 213 |
+
span = self.tokenizer.convert_tokens_to_string(label_tokens[idx_before:idx_ahead])
|
| 214 |
+
# First token of the label is UNK
|
| 215 |
+
if idx == 1 and label_tokens[0] == '▁':
|
| 216 |
+
#print('Label begins with unks')
|
| 217 |
+
to_replace = '^' + re.escape(span).replace(
|
| 218 |
+
re.escape(unk_token_to_string),
|
| 219 |
+
'.+?'
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
replaced_span = re.search(
|
| 223 |
+
to_replace,
|
| 224 |
+
label
|
| 225 |
+
)[0]
|
| 226 |
+
self.unknowns[-1][span.strip()] = replaced_span
|
| 227 |
+
# Last token of the label is UNK
|
| 228 |
+
elif idx == len(label_tokens)-2 and label_tokens[-1] == self.tokenizer.eos_token:
|
| 229 |
+
#print('Label ends with unks')
|
| 230 |
+
pre_idx = self.tokenizer.convert_tokens_to_string(label_tokens[idx_before:idx])
|
| 231 |
+
pre_idx_unk_counts = pre_idx.count(unk_token_to_string)
|
| 232 |
+
to_replace = re.escape(span).replace(
|
| 233 |
+
re.escape(unk_token_to_string),
|
| 234 |
+
f'[^{re.escape(pre_idx)}]+?'
|
| 235 |
+
) + '$'
|
| 236 |
+
|
| 237 |
+
if pre_idx.strip() == '':
|
| 238 |
+
to_replace = to_replace.replace('[^]', '(?<=\s)[^a-zA-Z0-9]')
|
| 239 |
+
|
| 240 |
+
replaced_span = re.search(
|
| 241 |
+
to_replace,
|
| 242 |
+
label
|
| 243 |
+
)[0]
|
| 244 |
+
self.unknowns[-1][span.strip()] = replaced_span
|
| 245 |
+
|
| 246 |
+
# A token in-between the label is UNK
|
| 247 |
+
else:
|
| 248 |
+
#print('Label has unks in the middle')
|
| 249 |
+
pre_idx = self.tokenizer.convert_tokens_to_string(label_tokens[idx_before:idx])
|
| 250 |
+
|
| 251 |
+
to_replace = re.escape(span).replace(
|
| 252 |
+
re.escape(unk_token_to_string),
|
| 253 |
+
f'[^{re.escape(pre_idx)}]+?'
|
| 254 |
+
)
|
| 255 |
+
#If there is nothing behind the ??, because it is in the middle but the previous token is also
|
| 256 |
+
#a ??, then we would end up with to_replace beginning with [^], which we can't have
|
| 257 |
+
if pre_idx.strip() == '':
|
| 258 |
+
to_replace = to_replace.replace('[^]', '(?<=\s)[^a-zA-Z0-9]')
|
| 259 |
+
|
| 260 |
+
replaced_span = re.search(
|
| 261 |
+
to_replace,
|
| 262 |
+
label
|
| 263 |
+
)
|
| 264 |
+
|
| 265 |
+
if replaced_span:
|
| 266 |
+
span = re.sub(r'\s([?.!",](?:\s|$))', r'\1', span.strip())
|
| 267 |
+
self.unknowns[-1][span] = replaced_span[0]
|
| 268 |
+
|
| 269 |
+
def replace_unks_on_sentence(self, sentence: str, loop_n : int = 3, empty_after : bool = False):
|
| 270 |
+
# Loop through in case the labels are repeated, maximum of three times
|
| 271 |
+
while '<unk>' in sentence and loop_n > 0:
|
| 272 |
+
loop_n -= 1
|
| 273 |
+
for unknowns in self.unknowns:
|
| 274 |
+
for k,v in unknowns.items():
|
| 275 |
+
# Leave to replace all-unk labels at the last pass
|
| 276 |
+
if k == '<unk>' and loop_n > 0:
|
| 277 |
+
continue
|
| 278 |
+
# In case it is because the first letter of the sentence has been uppercased
|
| 279 |
+
if not k in sentence and k[0] == k[0].lower() and k[0].upper() == sentence[0]:
|
| 280 |
+
k = k[0].upper() + k[1:]
|
| 281 |
+
v = v[0].upper() + v[1:]
|
| 282 |
+
# In case it is because a double space is found where it should not be
|
| 283 |
+
elif not k in sentence and len(re.findall(r'\s{2,}',k))>0:
|
| 284 |
+
k = re.sub(r'\s+', ' ', k)
|
| 285 |
+
#print(k,'/',v,'/',sentence)
|
| 286 |
+
sentence = sentence.replace(k.strip(),v.strip(),1)
|
| 287 |
+
#sentence = re.sub(k, v, sentence)
|
| 288 |
+
# Removing final doublespaces
|
| 289 |
+
sentence = re.sub(r'\s+', ' ', sentence).strip()
|
| 290 |
+
# Removing spaces before punctuation
|
| 291 |
+
sentence = re.sub(r'\s([?.!",](?:\s|$))', r'\1', sentence)
|
| 292 |
+
if empty_after:
|
| 293 |
+
self.unknowns = []
|
| 294 |
+
return sentence
|
| 295 |
+
|
| 296 |
+
if __name__ == '__main__':
|
| 297 |
+
|
| 298 |
+
verb_module = VerbModule()
|
| 299 |
+
verbs = verb_module.verbalise('translate Graph to English: <H> World Trade Center <R> height <T> 200 meter <H> World Trade Center <R> is a <T> tower')
|
| 300 |
+
print(verbs)
|
utils/wikidata_utils.py
ADDED
@@ -0,0 +1,173 @@
import json
import random
import uuid
import numpy as np
import time
import requests
import traceback
import pdb
import math
import ast
import pandas as pd
import pickle
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import return_sparql_query_results

from urllib3.exceptions import MaxRetryError, ConnectionError
from qwikidata.linked_data_interface import LdiResponseNotOk

import hashlib

class CachedWikidataAPI():

    def __init__(self, cache_path = 'entity_cache.p', save_every_x_queries=1):
        self.save_every_x_queries = save_every_x_queries
        self.x_queries_passed = 0
        self.languages = ['en','fr','es','pt','pt-br','it','de']
        self.cache_path = cache_path
        try:
            with open(self.cache_path,'rb') as f:
                self.entity_cache = pickle.load(f)
        except FileNotFoundError:
            self.entity_cache = {}

    def get_unique_id_from_str(self, my_str):
        return hashlib.md5(str.encode(my_str)).hexdigest()

    def save_entity_cache(self, force=False):
        if force:
            self.x_queries_passed = self.save_every_x_queries
        self.x_queries_passed = self.x_queries_passed+1
        if self.x_queries_passed >= self.save_every_x_queries:
            with open(self.cache_path,'wb') as f:
                pickle.dump(self.entity_cache,f)
            self.x_queries_passed = 0

    def get_entity(self, item_id):
        if item_id in self.entity_cache:
            return self.entity_cache[item_id]
        while True:
            try:
                entity = get_entity_dict_from_api(item_id)
                self.entity_cache[item_id] = entity
                self.save_entity_cache()
                return entity
            except (ConnectionError, MaxRetryError) as e:
                #traceback.print_exc()
                time.sleep(1)
                continue
            except LdiResponseNotOk:
                #traceback.print_exc()
                self.entity_cache[item_id] = 'deleted'
                self.save_entity_cache()
                return 'deleted'

    def get_label(self, item, non_language_set=False):
        if type(item) == str:
            entity = self.get_entity(item)
            if entity == 'deleted':
                return (entity, 'none')
            labels = entity['labels' if 'labels' in entity else 'lemmas']
        elif type(item) == dict:
            if 'labels' in item:
                labels = item['labels']
            elif 'lemmas' in item:
                labels = item['lemmas']
        for l in self.languages:
            if l in labels:
                return (labels[l]['value'], l)
        if non_language_set:
            all_labels = list(labels.keys())
            if len(all_labels)>0:
                return (labels[all_labels[0]]['value'], all_labels[0])
        return ('no-label', 'none')

    def get_desc(self, item, non_language_set=False):
        if type(item) == str:
            entity = self.get_entity(item)
            if entity == 'deleted':
                return (entity, 'none')
            descriptions = entity['descriptions']
        elif type(item) == dict:
            if 'descriptions' in item:
                descriptions = item['descriptions']
        for l in self.languages:
            if l in descriptions:
                return (descriptions[l]['value'], l)
        if non_language_set:
            all_descriptions = list(descriptions.keys())
            if len(all_descriptions)>0:
                return (descriptions[all_descriptions[0]]['value'], all_descriptions[0])
        return ('no-desc', 'none')

    def get_alias(self, item, non_language_set=False):
        if type(item) == str:
            entity = self.get_entity(item)
            if entity == 'deleted':
                return ([entity], 'none')
            aliases = entity['aliases']
        elif type(item) == dict:
            if 'aliases' in item:
                aliases = item['aliases']
        for l in self.languages:
            if l in aliases:
                return ([alias['value'] for alias in aliases[l]], l)
        if non_language_set:
            all_aliases = list(aliases.keys())
            if len(all_aliases)>0:
                # Aliases are stored as a list of {'value': ...} dicts per language,
                # so return all alias values for the first available language
                return ([alias['value'] for alias in aliases[all_aliases[0]]], all_aliases[0])
        return ('no-alias', 'none')

    def get_datatype(self, item):
        try:
            if type(item) == str:
                entity = self.get_entity(item)
                if entity == 'deleted':
                    return entity
                datatype = entity['datatype']
            elif type(item) == dict:
                datatype = item['datatype']
            return datatype
        except KeyError:
            return 'none'

    def get_claim_values_of(self, item, property_id):
        if type(item) == str:
            entity = self.get_entity(item)
            if entity == 'deleted':
                return entity
            claims = entity['claims']
        elif type(item) == dict:
            claims = item['claims']
        if property_id in claims:
            instance_of_claims = claims[property_id]
            return [i['mainsnak']['datavalue']['value']['id'] for i in instance_of_claims]
        else:
            return []

    def query_sparql_endpoint(self, sparql_query):
        sparql_query_id = self.get_unique_id_from_str(sparql_query)
        if sparql_query_id in self.entity_cache:
            return self.entity_cache[sparql_query_id]
        else:
            wikidata_sparql_url = 'https://query.wikidata.org/sparql'
            try:
                while True:
                    res = requests.get(wikidata_sparql_url, params={"query": sparql_query, "format": "json"})
                    if res.status_code in (429,504):
                        time.sleep(1)
                        continue
                    elif res.status_code == 200:
                        res = res.json()
                        self.entity_cache[sparql_query_id] = res
                        self.save_entity_cache()
                        return res
                    else:
                        print(res.status_code)
                        raise Exception
            except json.JSONDecodeError as e:
                #pdb.set_trace()
                print(res, res.__dict__)
                raise e
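A brief usage sketch of the cached client above (not part of the uploaded file), assuming network access to the Wikidata APIs, write permission for the pickle cache file, and that the repository root is on the Python path; the Q-id and SPARQL query are illustrative.

# Illustrative sketch only; mirrors the class defined above.
from utils.wikidata_utils import CachedWikidataAPI

api = CachedWikidataAPI(cache_path='entity_cache.p', save_every_x_queries=10)

# Label/description lookup falls back through the configured language list
# and returns a (value, language) tuple, e.g. ('Douglas Adams', 'en') for Q42.
label, lang = api.get_label('Q42')
desc, _ = api.get_desc('Q42')
print(label, '-', desc)

# Claim values of P31 (instance of) come back as a list of Q-ids
print(api.get_claim_values_of('Q42', 'P31'))

# SPARQL results are cached under an MD5 hash of the query text
res = api.query_sparql_endpoint('SELECT ?p ?o WHERE { wd:Q42 ?p ?o } LIMIT 5')
print(len(res['results']['bindings']))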