import math
import os
import re
import pandas as pd
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from tqdm import tqdm
from eval_modules.calc_repetitions import *
from llm_toolkit.llm_utils import load_tokenizer

print(f"loading {__file__}")

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
accuracy = evaluate.load("accuracy")

def extract_answer(text, debug=False):
    if text:
        # Remove the begin and end tokens
        text = re.sub(
            r".*?(assistant|\[/INST\]).+?\b", "", text, flags=re.DOTALL | re.MULTILINE
        )
        if debug:
            print("--------\nstep 1:", text)

        text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
        if debug:
            print("--------\nstep 2:", text)

        text = re.sub(
            r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
        )
        if debug:
            print("--------\nstep 3:", text)

    return text
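
# Usage sketch (illustrative only; the raw completion string below is made up):
#
#   raw = "<|start_header_id|>assistant<|end_header_id|>\n\nHello world<|eot_id|>"
#   extract_answer(raw)              # -> "Hello world"
#   extract_answer(raw, debug=True)  # prints the text after each stripping step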

def calc_metrics(references, predictions, debug=False):
    assert len(references) == len(
        predictions
    ), f"lengths are different: {len(references)} != {len(predictions)}"

    predictions = [extract_answer(text) for text in predictions]
    results = {}

    results["meteor"] = meteor.compute(predictions=predictions, references=references)[
        "meteor"
    ]
    results["bleu_scores"] = bleu.compute(
        predictions=predictions, references=references, max_order=4
    )
    results["rouge_scores"] = rouge.compute(
        predictions=predictions, references=references
    )

    correct = [1 if ref == pred else 0 for ref, pred in zip(references, predictions)]
    accuracy = sum(correct) / len(references)
    results["accuracy"] = accuracy
    if debug:
        correct_ids = [i for i, c in enumerate(correct) if c == 1]
        results["correct_ids"] = correct_ids

    return results
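
# Usage sketch (a minimal example with hypothetical data, not part of the pipeline):
#
#   refs = ["The weather is nice today.", "He went home."]
#   preds = ["The weather is nice today.", "He walked home."]
#   scores = calc_metrics(refs, preds, debug=True)
#   # scores["meteor"], scores["bleu_scores"]["bleu"], scores["rouge_scores"]["rougeL"],
#   # scores["accuracy"] and, with debug=True, scores["correct_ids"]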

def save_results(model_name, results_path, dataset, predictions, debug=False):
    if not os.path.exists(results_path):
        # Get the directory part of the file path
        dir_path = os.path.dirname(results_path)

        # Create all directories in the path (if they don't exist)
        os.makedirs(dir_path, exist_ok=True)
        df = dataset.to_pandas()
        df.drop(columns=["text", "prompt"], inplace=True)
    else:
        df = pd.read_csv(results_path, on_bad_lines="warn")

    df[model_name] = predictions

    if debug:
        print(df.head(1))

    df.to_csv(results_path, index=False)
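
# Usage sketch (paths, model name and predictions are placeholders):
#
#   save_results(
#       "microsoft/Phi-3.5-mini-instruct",  # becomes a new column in the results CSV
#       "results/mac-results.csv",          # created on first call, extended on later calls
#       datasets["test"],                   # HF dataset split with "chinese"/"english" columns
#       predictions,                        # one prediction string per test row
#   )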

def load_translation_dataset(data_path, tokenizer=None):
    train_data_file = data_path.replace(".tsv", "-train.tsv")
    test_data_file = data_path.replace(".tsv", "-test.tsv")

    if not os.path.exists(train_data_file):
        print("generating train/test data files")
        dataset = load_dataset(
            "csv", data_files=data_path, delimiter="\t", split="train"
        )
        print(len(dataset))
        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])

        datasets = dataset.train_test_split(test_size=0.2)
        print(len(dataset))

        # Convert to pandas DataFrame
        train_df = pd.DataFrame(datasets["train"])
        test_df = pd.DataFrame(datasets["test"])

        # Save to TSV
        train_df.to_csv(train_data_file, sep="\t", index=False)
        test_df.to_csv(test_data_file, sep="\t", index=False)

    print("loading train/test data files")
    datasets = load_dataset(
        "csv",
        data_files={"train": train_data_file, "test": test_data_file},
        delimiter="\t",
    )

    if tokenizer:
        translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"

        def formatting_prompts_func(examples):
            inputs = examples["chinese"]
            outputs = examples["english"]

            # The trailing None is a placeholder that is replaced with the user prompt below.
            messages = [
                {
                    "role": "system",
                    "content": "You are an expert in translating Chinese to English.",
                },
                None,
            ]

            model_name = os.getenv("MODEL_NAME")

            # if "mistral" in model_name.lower():
            #     messages = messages[1:]

            texts = []
            prompts = []
            for input, output in zip(inputs, outputs):
                prompt = translation_prompt.format(input)
                messages[-1] = {"role": "user", "content": prompt}

                prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                prompts.append(prompt)
                texts.append(prompt + output + tokenizer.eos_token)
            return {"text": texts, "prompt": prompts}

        datasets = datasets.map(
            formatting_prompts_func,
            batched=True,
        )

    print(datasets)
    return datasets
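
# Usage sketch (the data path is a placeholder; the TSV is expected to have
# "chinese" and "english" columns):
#
#   tokenizer = load_tokenizer("microsoft/Phi-3.5-mini-instruct")
#   datasets = load_translation_dataset("data/mac/mac.tsv", tokenizer)
#   print(datasets["train"][0]["prompt"])   # chat-formatted prompt
#   print(datasets["train"][0]["text"])     # prompt + reference translation + EOS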

def count_entries_with_max_tokens(entries, max_tokens):
    """
    Count the number of entries with the max output tokens or more.

    Parameters:
    entries (list of int): List of token counts for each entry.
    max_tokens (int): The maximum token threshold.

    Returns:
    int: The number of entries with token counts greater than or equal to max_tokens.
    """
    count = 0
    for tokens in entries:
        if tokens >= max_tokens:
            count += 1
    return count

def detect_repetition_scores(row, col, debug=False):
    # print(f"row: {row}")
    newline_score, repetition_score, total_repetitions = detect_repetitions(
        row[col], debug=debug
    )
    newline_score -= row["ground_truth_ews_score"]
    repetition_score -= row["ground_truth_repetition_score"]
    total_repetitions -= row["ground_truth_total_repetitions"]

    return pd.Series(
        [
            newline_score if newline_score > 0 else 0,
            repetition_score if repetition_score > 0 else 0,
            total_repetitions if total_repetitions > 0 else 0,
        ]
    )

def get_metrics(df, max_output_tokens=2048):
    metrics_df = pd.DataFrame(df.columns.T)[2:]
    metrics_df.rename(columns={0: "model"}, inplace=True)
    metrics_df["rpp"] = metrics_df["model"].apply(lambda x: x.split("rpp-")[-1])
    metrics_df["model"] = metrics_df["model"].apply(lambda x: x.split("/rpp-")[0])
    metrics_df.reset_index(inplace=True)
    metrics_df = metrics_df.drop(columns=["index"])

    tokenizers = {
        model: load_tokenizer(model) for model in metrics_df["model"].unique()
    }

    meteor = []
    bleu_1 = []
    rouge_l = []
    ews_score = []
    repetition_score = []
    total_repetitions = []
    num_max_output_tokens = []

    columns = df.columns[2:]
    # detect_scores is expected to come from eval_modules.calc_repetitions (wildcard import above)
    df[
        [
            "ground_truth_ews_score",
            "ground_truth_repetition_score",
            "ground_truth_total_repetitions",
        ]
    ] = df["english"].apply(detect_scores)

    for col in columns:
        metrics = calc_metrics(df["english"], df[col], debug=True)
        print(f"{col}: {metrics}")

        meteor.append(metrics["meteor"])
        bleu_1.append(metrics["bleu_scores"]["bleu"])
        rouge_l.append(metrics["rouge_scores"]["rougeL"])

        df[["ews_score", "repetition_score", "total_repetitions"]] = df.apply(
            lambda x: detect_repetition_scores(x, col), axis=1
        )
        ews_score.append(df["ews_score"].mean())
        repetition_score.append(df["repetition_score"].mean())
        total_repetitions.append(df["total_repetitions"].mean())

        model = col.split("/rpp")[0]
        new_col = f"ground_truth_tokens-{model}"
        df[new_col] = df["english"].apply(
            lambda x: len(tokenizers[model](x)["input_ids"])
        )

        new_col = f"output_tokens-{col}"
        df[new_col] = df[col].apply(lambda x: len(tokenizers[model](x)["input_ids"]))
        num_max_output_tokens.append(
            count_entries_with_max_tokens(df[new_col], max_output_tokens)
        )

    metrics_df["meteor"] = meteor
    metrics_df["bleu_1"] = bleu_1
    metrics_df["rouge_l"] = rouge_l
    metrics_df["ews_score"] = ews_score
    metrics_df["repetition_score"] = repetition_score
    metrics_df["total_repetitions"] = total_repetitions
    # rap: METEOR discounted by the average amount of repetition in the output
    metrics_df["rap"] = metrics_df.apply(
        lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
    )
    metrics_df["num_max_output_tokens"] = num_max_output_tokens

    return metrics_df
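
# Usage sketch (the column layout is assumed: "chinese", "english", then one
# prediction column per model/rpp combination, e.g. "model-name/rpp-1.00";
# the CSV path is a placeholder):
#
#   df = pd.read_csv("results/mac-results.csv")
#   metrics_df = get_metrics(df, max_output_tokens=2048)
#   metrics_df[["model", "rpp", "meteor", "bleu_1", "rouge_l", "rap"]]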

def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
    plt.figure(figsize=figsize)
    df_melted = pd.melt(
        metrics_df, id_vars="model", value_vars=["meteor", "bleu_1", "rouge_l"]
    )

    barplot = sns.barplot(x="variable", y="value", hue="model", data=df_melted)

    # Set different hatches for each model
    hatches = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*", "//", "\\\\"]

    # Create a dictionary to map models to hatches
    model_hatches = {
        model: hatches[i % len(hatches)]
        for i, model in enumerate(metrics_df["model"].unique())
    }

    # Apply hatches based on the model
    num_vars = len(df_melted["variable"].unique())
    for i, bar in enumerate(barplot.patches):
        model = df_melted["model"].iloc[i // num_vars]
        bar.set_hatch(model_hatches[model])

    # Manually update legend to match the bar hatches
    handles, labels = barplot.get_legend_handles_labels()
    for handle, model in zip(handles, metrics_df["model"].unique()):
        handle.set_hatch(model_hatches[model])

    barplot.set_xticklabels(["METEOR", "BLEU-1", "ROUGE-L"])
    for p in barplot.patches:
        if p.get_height() == 0:
            continue
        barplot.annotate(
            f"{p.get_height():.2f}",
            (p.get_x() + p.get_width() / 2.0, p.get_height()),
            ha="center",
            va="center",
            xytext=(0, 10),
            textcoords="offset points",
        )

    barplot.set(ylim=ylim, ylabel="Scores", xlabel="Metrics")
    plt.legend(bbox_to_anchor=(0.5, -0.1), loc="upper center")
    plt.show()
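
# Usage sketch: plot_metrics(get_metrics(df)) draws grouped METEOR/BLEU-1/ROUGE-L
# bars per model; raise ylim if any score exceeds the default 0.44 ceiling.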

def plot_times(perf_df, ylim=0.421):
    # Adjusted code to put "train-time" bars in red at the bottom
    fig, ax1 = plt.subplots(figsize=(12, 10))

    color_train = "tab:red"
    color_eval = "orange"
    ax1.set_xlabel("Models")
    ax1.set_ylabel("Time (mins)")
    ax1.set_xticks(range(len(perf_df["model"])))  # Set x-ticks positions
    ax1.set_xticklabels(perf_df["model"], rotation=90)

    # Plot "train-time" first so it's at the bottom
    ax1.bar(
        perf_df["model"],
        perf_df["train-time(mins)"],
        color=color_train,
        label="train-time",
    )

    # Then, plot "eval-time" on top of "train-time"
    ax1.bar(
        perf_df["model"],
        perf_df["eval-time(mins)"],
        bottom=perf_df["train-time(mins)"],
        color=color_eval,
        label="eval-time",
    )

    ax1.tick_params(axis="y")
    ax1.legend(loc="upper left")

    if "meteor" in perf_df.columns:
        ax2 = ax1.twinx()
        color_meteor = "tab:blue"
        ax2.set_ylabel("METEOR", color=color_meteor)
        ax2.plot(
            perf_df["model"],
            perf_df["meteor"],
            color=color_meteor,
            marker="o",
            label="meteor",
        )
        ax2.tick_params(axis="y", labelcolor=color_meteor)
        ax2.legend(loc="upper right")
        ax2.set_ylim(ax2.get_ylim()[0], ylim)

    # Show numbers in bars
    for p in ax1.patches:
        height = p.get_height()
        if height == 0:  # Skip bars with height 0
            continue
        ax1.annotate(
            f"{height:.2f}",
            (p.get_x() + p.get_width() / 2.0, p.get_y() + height),
            ha="center",
            va="center",
            xytext=(0, -10),
            textcoords="offset points",
        )

    fig.tight_layout()
    plt.show()

def translate_via_llm(text):
    base_url = os.getenv("OPENAI_BASE_URL") or "http://localhost:8000/v1"
    llm = ChatOpenAI(
        model="gpt-4o",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        base_url=base_url,
    )

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "human",
                "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{input}",
            ),
        ]
    )

    chain = prompt | llm
    response = chain.invoke(
        {
            "input": text,
        }
    )
    return response.content

def translate(text, cache_dict):
    if text in cache_dict:
        return cache_dict[text]
    else:
        translated_text = translate_via_llm(text)
        cache_dict[text] = translated_text
        return translated_text
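
# Usage sketch (requires OPENAI_API_KEY, and optionally OPENAI_BASE_URL for an
# OpenAI-compatible local server; the sentence below is just an example):
#
#   cache = {}
#   translate("你好，世界", cache)   # calls the LLM once
#   translate("你好，世界", cache)   # served from the cache on repeat calls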