import argparse
import json
import numpy as np
import tqdm
from pathlib import Path
from pprint import pprint
from collections import defaultdict, Counter
from transformers import AutoTokenizer
import scrl.utils as utils
from scrl.model import load_checkpoint
from scrl.eval_metrics import compute_token_f1, rouge_scorer, ROUGE_TYPES
from nltk import word_tokenize
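
# Evaluates saved search outputs against a dataset with reference summaries.
# The dataset JSONL is expected to provide "text" and "summaries" per item; the
# outputs JSONL provides candidate "summaries" with their "scores" (and "masks").
# "hc" in get_hc_summary presumably refers to the hill-climbing search baseline.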


def get_hc_summary(output):
    """Return the highest-scoring candidate summary from a search output."""
    i = np.argmax(output["scores"])
    summary = output["summaries"][i]
    mask = output["masks"][i]  # part of the output format, not used here
    return summary


def main(args):
    outputs = list(utils.read_jsonl(args.outputs))
    dataset = list(utils.read_jsonl(args.dataset))
    all_scores = defaultdict(list)
    for i, item in tqdm.tqdm(enumerate(dataset)):
        src = item["text"]
        if args.lower_src:
            src = src.lower()
        tgts = item["summaries"]
        pred = get_hc_summary(outputs[i])
        if args.max_chars > 0:
            pred = pred[:args.max_chars]
        # Tokenize source and prediction, either on whitespace (pretokenized
        # inputs) or with NLTK, matching how targets are handled below.
        if args.pretokenized:
            src_tokens = src.split()
            pred_tokens = pred.split()
        else:
            src_tokens = word_tokenize(src)
            pred_tokens = word_tokenize(pred)
        if args.lower_summary:
            pred_tokens = [t.lower() for t in pred_tokens]
        # Score the prediction against each reference summary of this item.
        item_scores = defaultdict(list)
        for tgt in tgts:
            if args.pretokenized:
                tgt_tokens = tgt.split()
            else:
                tgt_tokens = word_tokenize(tgt)
            if args.lower_summary:
                tgt_tokens = [t.lower() for t in tgt_tokens]
            token_fscore = compute_token_f1(tgt_tokens, pred_tokens, use_counts=True)
            rouge_scores = rouge_scorer.score(tgt, pred)
            for rouge_type, rouge_type_scores in rouge_scores.items():
                item_scores[f"{rouge_type}-p"].append(rouge_type_scores.precision)
                item_scores[f"{rouge_type}-r"].append(rouge_type_scores.recall)
                item_scores[f"{rouge_type}-f"].append(rouge_type_scores.fmeasure)
            item_scores["token-f1"].append(token_fscore)
            item_scores["tgt-len"].append(len(tgt_tokens))
            item_scores["tgt-cr"].append(len(tgt_tokens) / len(src_tokens))
        for k, values in item_scores.items():
            item_mean = np.mean(values)
            all_scores[k].append(item_mean)
        all_scores["pred-len"].append(len(pred_tokens))
        all_scores["src-len"].append(len(src_tokens))
        all_scores["pred-cr"].append(len(pred_tokens) / len(src_tokens))
        if args.verbose:
            print("SRC:", src)
            print("TGT:", tgts[0])
            print("PRED:", pred)
            print("=" * 100)

    print("=" * 100)
| print("RESULTS:") | |
| print("="*20, "Length (#tokens):", "="*20) | |
| for metric in ("src-len", "tgt-len", "pred-len"): | |
| mean = np.mean(all_scores[metric]) | |
| print(f"{metric}: {mean:.2f}") | |
| print() | |
| print("="*20, "Compression ratio:", "="*20) | |
| for metric in ("tgt-cr", "pred-cr"): | |
| mean = np.mean(all_scores[metric]) | |
| print(f"{metric}: {mean:.2f}") | |
| print() | |
| print("="*20, "Token F1-Score:", "="*20) | |
| mean = np.mean(all_scores["token-f1"]) | |
| print(f"f1-score: {mean:.3f}") | |
| print() | |
| print("="*20, "ROUGE F1-Scores:", "="*20) | |
| for rouge_type in ROUGE_TYPES: | |
| mean = np.mean(all_scores[f"{rouge_type}-f"]) | |
| print(f"{rouge_type}: {mean:.4f}") | |
| print() | |
| print("="*20, "ROUGE Recall:", "="*20) | |
| for rouge_type in ROUGE_TYPES: | |
| mean = np.mean(all_scores[f"{rouge_type}-r"]) | |
| print(f"{rouge_type}: {mean:.4f}") | |
| print() | |


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--outputs', required=True)
    parser.add_argument('--pretokenized', action="store_true")
    parser.add_argument('--max-chars', type=int, default=-1)
    parser.add_argument('--verbose', action="store_true")
    parser.add_argument('--lower-src', action="store_true")
    parser.add_argument('--lower-summary', action="store_true")
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())
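
# Example invocation (script and file names are placeholders, only the flags
# above are defined by this script):
#   python evaluate_hc_output.py --dataset data/test.jsonl \
#       --outputs outputs/hc.jsonl --pretokenized --lower-summary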