From 64c72c0282f5cf3f0a988844d3119199499433e5 Mon Sep 17 00:00:00 2001 From: Lj Miranda <12949683+ljvmiranda921@users.noreply.github.com> Date: Sat, 12 Oct 2024 21:59:04 -0700 Subject: [PATCH] Add plot for NLLB vs Google Translate (#48) --- analysis/_plot_leaderboard.py | 7 ++- analysis/avg_agreement_final.py | 2 +- analysis/maple_results.py | 105 +++++++++++++++----------------- analysis/plot_results.py | 83 +++++++++++++++++++++++-- 4 files changed, 131 insertions(+), 66 deletions(-) diff --git a/analysis/_plot_leaderboard.py b/analysis/_plot_leaderboard.py index 985d6f0..bac25f6 100644 --- a/analysis/_plot_leaderboard.py +++ b/analysis/_plot_leaderboard.py @@ -3,12 +3,12 @@ from pathlib import Path from typing import Optional +import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt from huggingface_hub import snapshot_download -from analysis.plot_utils import get_scores, PLOT_PARAMS +from analysis.plot_utils import PLOT_PARAMS, get_scores logging.basicConfig(level=logging.INFO) @@ -99,7 +99,8 @@ def main(): output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.png" csv_output_file = output_dir / f"leaderboard-{model_type.replace(' ', '_')}.csv" data_to_cache = data.copy(deep=True) - data_to_cache["eng_Latn"] = model_type_df["eng_Latn"] + if "eng_Latn" in model_type_df.columns: + data_to_cache["eng_Latn"] = model_type_df["eng_Latn"] data_to_cache = data_to_cache.rename(columns={"Avg": "Avg_Multilingual"}) data_to_cache.to_csv(csv_output_file) fig.savefig(output_file, dpi=120) diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py index 25ed807..b1bd0b1 100644 --- a/analysis/avg_agreement_final.py +++ b/analysis/avg_agreement_final.py @@ -1,6 +1,6 @@ -import seaborn as sns import matplotlib.pyplot as plt import numpy as np +import seaborn as sns FONT_SIZES = {"small": 12, "medium": 16, "large": 18} COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"} diff --git a/analysis/maple_results.py b/analysis/maple_results.py index 45ee3d0..0817159 100644 --- a/analysis/maple_results.py +++ b/analysis/maple_results.py @@ -1,92 +1,87 @@ -import json -from pathlib import Path - import argparse +import json import logging +from collections import defaultdict +from itertools import combinations from pathlib import Path from typing import Optional +import datasets +import matplotlib.pyplot as plt +import numpy as np import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt from huggingface_hub import snapshot_download -import datasets -import json - -import numpy as np -import matplotlib.pyplot as plt -from itertools import combinations -from collections import defaultdict - FONT_SIZES = {"small": 12, "medium": 16, "large": 18} PLOT_PARAMS = { - "font.family": "serif", - "font.serif": ["Times New Roman", "STIX"], - "font.size": FONT_SIZES.get("medium"), - "axes.titlesize": FONT_SIZES.get("large"), - "axes.labelsize": FONT_SIZES.get("large"), - "xtick.labelsize": FONT_SIZES.get("large"), - "ytick.labelsize": FONT_SIZES.get("small"), - "legend.fontsize": FONT_SIZES.get("medium"), - "figure.titlesize": FONT_SIZES.get("medium"), - "text.usetex": False, + "font.family": "serif", + "font.serif": ["Times New Roman", "STIX"], + "font.size": FONT_SIZES.get("medium"), + "axes.titlesize": FONT_SIZES.get("large"), + "axes.labelsize": FONT_SIZES.get("large"), + "xtick.labelsize": FONT_SIZES.get("large"), + "ytick.labelsize": FONT_SIZES.get("small"), + "legend.fontsize": FONT_SIZES.get("medium"), + "figure.titlesize": FONT_SIZES.get("medium"), + "text.usetex": False, } logging.basicConfig(level=logging.INFO) plt.rcParams.update(PLOT_PARAMS) + def load_json(json_file_path): - with open(json_file_path, "r") as file: - json_data = json.load(file) - return json_data + with open(json_file_path, "r") as file: + json_data = json.load(file) + return json_data -results_dir = 'data/eval-results-maple' + +results_dir = "data/eval-results-maple" results_path = Path(results_dir) results_all = [] for result_file in results_path.glob("*.json"): - raw_results = load_json(result_file) - if "leaderboard" in raw_results.keys(): - model_id = raw_results["model"] - subset_results = raw_results['subset'] - overall = raw_results['scores']['accuracy'] - remove_key = ['model', 'model_type', 'chat_template'] - for key in remove_key: - del subset_results[key] - elif "subset_results" in raw_results.keys(): - model_id = raw_results["model"] - subset_results = raw_results['subset_results'] - overall = raw_results['accuracy'] - else: - model_id = raw_results["model"] - subset_results = raw_results['extra_results'] - overall = raw_results['accuracy'] - # print(model_id, overall) - # print("\t", subset_results) - # results_all.append([model_id, overall, subset_results]) - results_all.append({'Model': model_id, 'Avg': overall, **subset_results}) - - # import ipdb; ipdb.set_trace() - -TOP = 10 + raw_results = load_json(result_file) + if "leaderboard" in raw_results.keys(): + model_id = raw_results["model"] + subset_results = raw_results["subset"] + overall = raw_results["scores"]["accuracy"] + remove_key = ["model", "model_type", "chat_template"] + for key in remove_key: + del subset_results[key] + elif "subset_results" in raw_results.keys(): + model_id = raw_results["model"] + subset_results = raw_results["subset_results"] + overall = raw_results["accuracy"] + else: + model_id = raw_results["model"] + subset_results = raw_results["extra_results"] + overall = raw_results["accuracy"] + # print(model_id, overall) + # print("\t", subset_results) + # results_all.append([model_id, overall, subset_results]) + results_all.append({"Model": model_id, "Avg": overall, **subset_results}) + + # import ipdb; ipdb.set_trace() + +TOP = 10 # results_all.sort(key=lambda x: x[1], reverse=True) # results_all = results_all[:TOP] # print(results_all) df_results = pd.DataFrame(results_all) -df_results = df_results.sort_values(by='Avg', ascending=False).reset_index(drop=True) +df_results = df_results.sort_values(by="Avg", ascending=False).reset_index(drop=True) df_results = df_results.head(10).reset_index(drop=True) -df_results.columns = df_results.columns.str.replace('^maple-', '', regex=True) +df_results.columns = df_results.columns.str.replace("^maple-", "", regex=True) df_results = df_results.set_index("Model") df_results = df_results * 100 fig, ax = plt.subplots(1, 1, figsize=(18, 5)) -sns.heatmap(df_results, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, - fmt=".1f", cbar=False) +sns.heatmap(df_results, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".1f", cbar=False) ax.xaxis.set_ticks_position("top") ax.tick_params(axis="x", labelrotation=45) @@ -97,5 +92,3 @@ def load_json(json_file_path): plt.savefig("plots/maple.pdf", bbox_inches="tight") # import ipdb; ipdb.set_trace() - - diff --git a/analysis/plot_results.py b/analysis/plot_results.py index 1ec28c4..ae885e8 100644 --- a/analysis/plot_results.py +++ b/analysis/plot_results.py @@ -1,12 +1,12 @@ import argparse import logging -from pathlib import Path from inspect import signature +from pathlib import Path from typing import Optional +import matplotlib.pyplot as plt import pandas as pd import seaborn as sns -import matplotlib.pyplot as plt from adjustText import adjust_text FONT_SIZES = {"small": 12, "medium": 16, "large": 18} @@ -90,6 +90,11 @@ def get_args(): parser_ling_dims.add_argument("--input_path", type=Path, required=True, help="Path to the results file.") parser_ling_dims.add_argument("--langdata", type=Path, required=True, help="Path to the language data file.") parser_ling_dims.add_argument("--top_n", type=int, required=False, default=None, help="Aggregate only the scores for top-n.") + + parser_translate = subparsers.add_parser("translate", help="Plot translation quality.", parents=[shared_args]) + parser_translate.add_argument("--gtrans", type=Path, required=True, help="Path to the Google Translate results file.") + parser_translate.add_argument("--nllb", type=Path, required=True, help="Path to the NLLB-3.3B results file.") + # fmt: on return parser.parse_args() @@ -101,6 +106,7 @@ def main(): "main_heatmap": plot_main_heatmap, "eng_drop_line": plot_eng_drop_line, "ling_dims": plot_ling_dims, + "translate": plot_translate, } def _filter_args(func, kwargs): @@ -254,8 +260,6 @@ def plot_eng_drop_line( # # bbox=dict(facecolor="white", edgecolor="black", boxstyle="round,pad=0.5"), # ) - # ax.spines["right"].set_visible(False) - # ax.spines["top"].set_visible(False) plt.tight_layout() fig.savefig(output_path, bbox_inches="tight") @@ -316,8 +320,75 @@ def plot_ling_dims( ax.set_ylabel("") ax.set_xlabel("M-RewardBench Score") - # ax.spines["right"].set_visible(False) - # ax.spines["top"].set_visible(False) + plt.tight_layout() + fig.savefig(output_path, bbox_inches="tight") + + +def plot_translate( + gtrans: Path, + nllb: Path, + output_path: Path, + figsize: Optional[tuple[int, int]] = (18, 5), +): + columns = ["Model", "Model_Type", "Avg_Multilingual"] + gtrans_df = pd.read_csv(gtrans)[columns].rename(columns={"Avg_Multilingual": "Avg_Gtrans"}) + nllb_df = pd.read_csv(nllb)[columns].rename(columns={"Avg_Multilingual": "Avg_NLLB"}) + + combined = nllb_df.merge(gtrans_df, how="left", on="Model") + combined = combined[["Model", "Avg_NLLB", "Avg_Gtrans", "Model_Type_x"]].rename( + columns={"Model_Type_x": "Model_Type"} + ) + + print(combined.sort_values(by="Avg_NLLB", ascending=False)) + + colors = { + "Sequence Classifier": COLORS.get("green"), + "Generative RM": COLORS.get("purple"), + "DPO": COLORS.get("orange"), + } + + labels = { + "Sequence Classifier": "Classifier RM", + "Generative RM": "Generative RM", + "DPO": "Implicit RM", + } + + fig, ax = plt.subplots(figsize=figsize) + for _, row in combined.iterrows(): + ax.plot( + [1, 2], + [row["Avg_NLLB"], row["Avg_Gtrans"]], + marker="o", + color=colors[row["Model_Type"]], + label=labels[row["Model_Type"]], + ) + + # Avoid duplicate labels in the legend + handles, labels = plt.gca().get_legend_handles_labels() + by_label = dict(zip(labels, handles)) + ax.legend( + by_label.values(), + by_label.keys(), + frameon=False, + ncols=3, + loc="lower center", + bbox_to_anchor=(0.5, -0.2), + ) + + # ax.grid(color="gray", alpha=0.2, which="both", axis="x") + # ax.set_ylabel("M-RewardBench Overall Score") + + ax.set_xticks([1, 2]) + ax.set_xticklabels(["NLLB", "Google Translate"]) + ax.yaxis.set_visible(False) + + ax.spines[["top", "bottom", "left", "right"]].set_visible(False) + ax.vlines( + [1, 2], + ymin=combined[["Avg_NLLB", "Avg_Gtrans"]].min().min(), + ymax=combined[["Avg_NLLB", "Avg_Gtrans"]].max().max(), + colors="gray", + ) plt.tight_layout() fig.savefig(output_path, bbox_inches="tight")