Spaces:

DeepLearning101
/

IE101TW

Sleeping

App Files Files Community

DeepLearning101 committed on Oct 15, 2023

Commit

e95b4e9

1 Parent(s): 08f4077

Upload 3 files

Browse files

Files changed (3) hide show

models/multiple_choice/duma.py +355 -0
models/multiple_choice/multiple_choice.py +196 -0
models/multiple_choice/multiple_choice_tag.py +271 -0

models/multiple_choice/duma.py ADDED Viewed

	@@ -0,0 +1,355 @@

+# -*- coding: utf-8 -*-
+# @Time    : 2022/4/12 12:12 下午
+# @Author  : JianingWang
+# @File    : duma.py
+import math
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
+from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel
+from transformers.models.albert.modeling_albert import AlbertModel, AlbertPreTrainedModel
+from transformers.models.megatron_bert.modeling_megatron_bert import MegatronBertModel, MegatronBertPreTrainedModel
+from transformers.modeling_outputs import MultipleChoiceModelOutput
+def split_context_query(sequence_output, pq_end_pos, input_ids):
+    context_max_len = sequence_output.size(1)
+    query_max_len = sequence_output.size(1)
+    sep_tok_len = 1  # [SEP]
+    context_sequence_output = sequence_output.new(
+        torch.Size((sequence_output.size(0), context_max_len, sequence_output.size(2)))).zero_()
+    query_sequence_output = sequence_output.new_zeros(
+        (sequence_output.size(0), query_max_len, sequence_output.size(2)))
+    query_attention_mask = sequence_output.new_zeros((sequence_output.size(0), query_max_len))
+    context_attention_mask = sequence_output.new_zeros((sequence_output.size(0), context_max_len))
+    for i in range(0, sequence_output.size(0)):
+        p_end = pq_end_pos[i][0]
+        q_end = pq_end_pos[i][1]
+        context_sequence_output[i, :min(context_max_len, p_end)] = sequence_output[i, 1: 1 + min(context_max_len, p_end)]
+        idx = min(query_max_len, q_end - p_end - sep_tok_len)
+        query_sequence_output[i, :idx] = sequence_output[i, p_end + sep_tok_len + 1: p_end + sep_tok_len + 1 + min(q_end - p_end - sep_tok_len, query_max_len)]
+        query_attention_mask[i, :idx] = sequence_output.new_ones((1, query_max_len))[0, :idx]
+        context_attention_mask[i, : min(context_max_len, p_end)] = sequence_output.new_ones((1, context_max_len))[0, : min(context_max_len, p_end)]
+    return context_sequence_output, query_sequence_output, context_attention_mask, query_attention_mask
+class BertCoAttention(nn.Module):
+    def __init__(self, config):
+        super(BertCoAttention, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+        self.output_attentions = config.output_attentions
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+    def forward(self, context_states, query_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
+        mixed_query_layer = self.query(query_states)
+        extended_attention_mask = attention_mask[:, None, None, :]
+        # extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        attention_mask = extended_attention_mask
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder"s padding tokens are not attended to.
+        if encoder_hidden_states is not None:
+            mixed_key_layer = self.key(encoder_hidden_states)
+            mixed_value_layer = self.value(encoder_hidden_states)
+            attention_mask = encoder_attention_mask
+        else:
+            mixed_key_layer = self.key(context_states)
+            mixed_value_layer = self.value(context_states)
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        # outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
+        outputs = context_layer
+        return outputs
+class BertDUMAForMultipleChoice(BertPreTrainedModel):
+    def __init__(self, config):
+        super(BertDUMAForMultipleChoice, self).__init__(config)
+        self.bert = BertModel(config)
+        self.classifier_2 = nn.Linear(2 * config.hidden_size, 1)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.bert_att = BertCoAttention(config)
+        self.init_weights()
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                inputs_embeds=None, labels=None, pq_end_pos=None, iter=1):
+        num_choices = input_ids.shape[1]
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_head_mask = head_mask.view(-1, head_mask.size(-1)) if head_mask is not None else None
+        flat_inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1)) if inputs_embeds is not None else None
+        outputs = self.bert(
+            input_ids=flat_input_ids,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=flat_head_mask,
+            inputs_embeds=flat_inputs_embeds
+        )
+        sequence_output = outputs[0]
+        pq_end_pos = pq_end_pos.view(-1, pq_end_pos.size(-1))
+        context_sequence_output, query_sequence_output, context_attention_mask, query_attention_mask = \
+            split_context_query(sequence_output, pq_end_pos, input_ids)
+        for _ in range(0, iter):
+            cq_biatt_output = self.bert_att(context_sequence_output, query_sequence_output, context_attention_mask)
+            qc_biatt_output = self.bert_att(query_sequence_output, context_sequence_output, query_attention_mask)
+            query_sequence_output = cq_biatt_output
+            context_sequence_output = qc_biatt_output
+        cat_output = torch.cat([torch.mean(qc_biatt_output, 1), torch.mean(cq_biatt_output, 1)], 1)
+        pooled_output = self.dropout(cat_output)
+        logits = self.classifier_2(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            outputs = (loss,) + outputs
+        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+class RobertaDUMAForMultipleChoice(RobertaPreTrainedModel):
+    def __init__(self, config):
+        super(RobertaDUMAForMultipleChoice, self).__init__(config)
+        self.roberta = RobertaModel(config)
+        self.classifier_2 = nn.Linear(2 * config.hidden_size, 1)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.bert_att = BertCoAttention(config)
+        self.init_weights()
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                inputs_embeds=None, labels=None, pq_end_pos=None, iter=1):
+        num_choices = input_ids.shape[1]
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_head_mask = head_mask.view(-1, head_mask.size(-1)) if head_mask is not None else None
+        flat_inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1)) if inputs_embeds is not None else None
+        outputs = self.roberta(
+            input_ids=flat_input_ids,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=flat_head_mask,
+            inputs_embeds=flat_inputs_embeds
+        )
+        sequence_output = outputs[0]
+        pq_end_pos = pq_end_pos.view(-1, pq_end_pos.size(-1))
+        context_sequence_output, query_sequence_output, context_attention_mask, query_attention_mask = \
+            split_context_query(sequence_output, pq_end_pos, input_ids)
+        for _ in range(0, iter):
+            cq_biatt_output = self.bert_att(context_sequence_output, query_sequence_output, context_attention_mask)
+            qc_biatt_output = self.bert_att(query_sequence_output, context_sequence_output, query_attention_mask)
+            query_sequence_output = cq_biatt_output
+            context_sequence_output = qc_biatt_output
+        cat_output = torch.cat([torch.mean(qc_biatt_output, 1), torch.mean(cq_biatt_output, 1)], 1)
+        pooled_output = self.dropout(cat_output)
+        logits = self.classifier_2(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            outputs = (loss,) + outputs
+        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+class AlbertDUMAForMultipleChoice(AlbertPreTrainedModel):
+    def __init__(self, config):
+        super(AlbertDUMAForMultipleChoice, self).__init__(config)
+        self.albert = AlbertModel(config)
+        self.classifier_2 = nn.Linear(2 * config.hidden_size, 1)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.bert_att = BertCoAttention(config)
+        self.init_weights()
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                inputs_embeds=None, labels=None, pq_end_pos=None, iter=1):
+        num_choices = input_ids.shape[1]
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_head_mask = head_mask.view(-1, head_mask.size(-1)) if head_mask is not None else None
+        flat_inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1)) if inputs_embeds is not None else None
+        outputs = self.albert(
+            input_ids=flat_input_ids,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=flat_head_mask,
+            inputs_embeds=flat_inputs_embeds
+        )
+        sequence_output = outputs[0]
+        pq_end_pos = pq_end_pos.view(-1, pq_end_pos.size(-1))
+        context_sequence_output, query_sequence_output, context_attention_mask, query_attention_mask = \
+            split_context_query(sequence_output, pq_end_pos, input_ids)
+        for _ in range(0, iter):
+            cq_biatt_output = self.bert_att(context_sequence_output, query_sequence_output, context_attention_mask)
+            qc_biatt_output = self.bert_att(query_sequence_output, context_sequence_output, query_attention_mask)
+            query_sequence_output = cq_biatt_output
+            context_sequence_output = qc_biatt_output
+        cat_output = torch.cat([torch.mean(qc_biatt_output, 1), torch.mean(cq_biatt_output, 1)], 1)
+        pooled_output = self.dropout(cat_output)
+        logits = self.classifier_2(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            outputs = (loss,) + outputs
+        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+class MegatronDumaForMultipleChoice(MegatronBertPreTrainedModel):
+    def __init__(self, config):
+        super(MegatronDumaForMultipleChoice, self).__init__(config)
+        self.bert = MegatronBertModel(config)
+        self.classifier_2 = nn.Linear(2 * config.hidden_size, 1)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.bert_att = BertCoAttention(config)
+        self.init_weights()
+    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                inputs_embeds=None, labels=None, pq_end_pos=None, iter=1):
+        num_choices = input_ids.shape[1]
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_head_mask = head_mask.view(-1, head_mask.size(-1)) if head_mask is not None else None
+        flat_inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-1)) if inputs_embeds is not None else None
+        outputs = self.bert(
+            input_ids=flat_input_ids,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=flat_head_mask,
+            inputs_embeds=flat_inputs_embeds
+        )
+        sequence_output = outputs[0]
+        pq_end_pos = pq_end_pos.view(-1, pq_end_pos.size(-1))
+        context_sequence_output, query_sequence_output, context_attention_mask, query_attention_mask = \
+            split_context_query(sequence_output, pq_end_pos, input_ids)
+        for _ in range(0, iter):
+            cq_biatt_output = self.bert_att(context_sequence_output, query_sequence_output, context_attention_mask)
+            qc_biatt_output = self.bert_att(query_sequence_output, context_sequence_output, query_attention_mask)
+            query_sequence_output = cq_biatt_output
+            context_sequence_output = qc_biatt_output
+        cat_output = torch.cat([torch.mean(qc_biatt_output, 1), torch.mean(cq_biatt_output, 1)], 1)
+        pooled_output = self.dropout(cat_output)
+        logits = self.classifier_2(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            outputs = (loss,) + outputs
+        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)

models/multiple_choice/multiple_choice.py ADDED Viewed

	@@ -0,0 +1,196 @@

+# -*- coding: utf-8 -*-
+# @Time    : 2022/4/16 12:10 下午
+# @Author  : JianingWang
+# @File    : multiple_choice.py
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+import torch.nn.functional as F
+# from transformers import MegatronBertPreTrainedModel, MegatronBertModel
+from transformers.models.megatron_bert import MegatronBertPreTrainedModel, MegatronBertModel
+from transformers.modeling_outputs import MultipleChoiceModelOutput
+class MegatronBertForMultipleChoice(MegatronBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.bert = MegatronBertModel(config)
+        # classifier_dropout = (
+        #     config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        # )
+        classifier_dropout = 0.2
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            token_type_ids=None,
+            position_ids=None,
+            head_mask=None,
+            inputs_embeds=None,
+            labels=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            pseudo=None
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = outputs[1] # [batch_size, num_choices, hidden_dim]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output) # [batch_size, num_choices, 1]
+        reshaped_logits = logits.view(-1, num_choices) # [batch_size, num_choices]
+        loss = None
+        if labels is not None:
+            if pseudo is None:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(reshaped_logits, labels)
+            else:
+                loss_fct = CrossEntropyLoss(reduction="none")
+                loss = loss_fct(reshaped_logits, labels)
+                weight = 1 - pseudo * 0.9
+                loss *= weight
+                loss = loss.mean()
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+class MegatronBertRDropForMultipleChoice(MegatronBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.bert = MegatronBertModel(config)
+        # classifier_dropout = (
+        #     config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        # )
+        classifier_dropout = 0.2
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            token_type_ids=None,
+            position_ids=None,
+            head_mask=None,
+            inputs_embeds=None,
+            labels=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+        logits_list = []
+        for i in range(2):
+            outputs = self.bert(
+                input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                position_ids=position_ids,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            pooled_output = outputs[1]
+            pooled_output = self.dropout(pooled_output)
+            logits = self.classifier(pooled_output)
+            logits_list.append(logits.view(-1, num_choices))
+        loss = None
+        alpha = 1.0
+        for logits in logits_list:
+            if labels is not None:
+                loss_fct = CrossEntropyLoss()
+                l = loss_fct(logits, labels)
+                if loss:
+                    loss += alpha * l
+                else:
+                    loss = alpha * l
+        if loss is not None:
+            p = torch.log_softmax(logits_list[0], dim=-1)
+            p_tec = torch.exp(p)
+            q = torch.log_softmax(logits_list[-1], dim=-1)
+            q_tec = torch.exp(q)
+            kl_loss = F.kl_div(p, q_tec, reduction="none").sum()
+            reverse_kl_loss = F.kl_div(q, p_tec, reduction="none").sum()
+            loss += 0.5 * (kl_loss + reverse_kl_loss) / 2.
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=logits_list[0],
+            hidden_states=None,
+            attentions=None
+        )

models/multiple_choice/multiple_choice_tag.py ADDED Viewed

	@@ -0,0 +1,271 @@

+# -*- coding: utf-8 -*-
+# @Time    : 2022/3/3 7:59 下午
+# @Author  : JianingWang
+# @File    : multiple_choice_tag.py
+import torch
+from roformer import RoFormerPreTrainedModel, RoFormerModel
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import MegatronBertPreTrainedModel, MegatronBertModel
+from transformers.modeling_outputs import MultipleChoiceModelOutput
+from transformers.models.bert import BertPreTrainedModel, BertModel
+class BertForTagMultipleChoice(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.bert = BertModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size * 2, 1)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            token_type_ids=None,
+            position_ids=None,
+            head_mask=None,
+            inputs_embeds=None,
+            labels=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            pseudo=None
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        w = torch.logical_and(input_ids >= min(self.config.start_token_ids), input_ids <= max(self.config.start_token_ids))
+        start_index = w.nonzero()[:, 1].view(-1, 2)
+        # <start_entity> + <end_entity> 进分类
+        pooled_output = torch.cat([torch.cat([x[y[0], :], x[y[1], :]]).unsqueeze(0) for x, y in zip(outputs.last_hidden_state, start_index)])
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+        loss = None
+        if labels is not None:
+            if pseudo is None:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(reshaped_logits, labels)
+            else:
+                loss_fct = CrossEntropyLoss(reduction="none")
+                loss = loss_fct(reshaped_logits, labels)
+                weight = 1 - pseudo * 0.9
+                loss *= weight
+                loss = loss.mean()
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+class RoFormerForTagMultipleChoice(RoFormerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.roformer = RoFormerModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size * 2, 1)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            token_type_ids=None,
+            head_mask=None,
+            inputs_embeds=None,
+            labels=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
+            num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See
+            :obj:`input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+        outputs = self.roformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        w = torch.logical_and(input_ids >= min(self.config.start_token_ids), input_ids <= max(self.config.start_token_ids))
+        start_index = w.nonzero()[:, 1].view(-1, 2)
+        # <start_entity> + <end_entity> 进分类
+        pooled_output = torch.cat([torch.cat([x[y[0], :], x[y[1], :]]).unsqueeze(0) for x, y in zip(outputs.last_hidden_state, start_index)])
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+class MegatronBertForTagMultipleChoice(MegatronBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.bert = MegatronBertModel(config)
+        self.dropout = nn.Dropout(0.2)
+        self.classifier = nn.Linear(config.hidden_size * 2, 1)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+            self,
+            input_ids=None,
+            attention_mask=None,
+            token_type_ids=None,
+            position_ids=None,
+            head_mask=None,
+            inputs_embeds=None,
+            labels=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            pseudo=None
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        w = torch.logical_and(input_ids >= min(self.config.start_token_ids), input_ids <= max(self.config.start_token_ids))
+        start_index = w.nonzero()[:, 1].view(-1, 2)
+        # <start_entity> + <end_entity> 进分类
+        pooled_output = torch.cat([torch.cat([x[y[0], :], x[y[1], :]]).unsqueeze(0) for x, y in zip(outputs.last_hidden_state, start_index)])
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+        loss = None
+        if labels is not None:
+            if pseudo is None:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(reshaped_logits, labels)
+            else:
+                loss_fct = CrossEntropyLoss(reduction="none")
+                loss = loss_fct(reshaped_logits, labels)
+                weight = 1 - pseudo*0.9
+                loss *= weight
+                loss = loss.mean()
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )