diff --git a/analysis/avg_agreement_final.py b/analysis/avg_agreement_final.py index cb93b89..25ed807 100644 --- a/analysis/avg_agreement_final.py +++ b/analysis/avg_agreement_final.py @@ -2,63 +2,40 @@ import matplotlib.pyplot as plt import numpy as np +FONT_SIZES = {"small": 12, "medium": 16, "large": 18} +COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"} + +PLOT_PARAMS = { + "font.family": "serif", + "font.serif": ["Times New Roman", "STIX"], + "font.size": FONT_SIZES.get("medium"), + "axes.titlesize": FONT_SIZES.get("large"), + "axes.labelsize": FONT_SIZES.get("large"), + "xtick.labelsize": FONT_SIZES.get("large"), + "ytick.labelsize": FONT_SIZES.get("large"), + "legend.fontsize": FONT_SIZES.get("medium"), + "figure.titlesize": FONT_SIZES.get("medium"), + "text.usetex": False, +} + +plt.rcParams.update(PLOT_PARAMS) + + data = { - "meta-llama/Meta-Llama-3.1-8B-Instruct": [ - 0.3533086666014079, - 0.052422082615756406 - ], - "cohere/c4ai-aya-23-35b": [ - 0.43767196047824003, - 0.026040919354464294 - ], - "cohere/c4ai-aya-23-8b": [ - 0.013483014909052663, - 0.03363706833599835 - ], - "cohere/command-r-08-2024": [ - 0.374457668650282, - 0.02926089754079793 - ], - "cohere/command-r-plus-08-2024": [ - 0.3830841816733316, - 0.020185255968455686 - ], - "google/gemma-1.1-7b-it": [ - 0.5190375637539242, - 0.027757722654111305 - ], - "google/gemma-2-9b-it": [ - 0.5181663123111222, - 0.031090119385244894 - ], - "meta-llama/Meta-Llama-3-70B-Instruct": [ - 0.5685224105896568, - 0.04853344616275034 - ], - "meta-llama/Meta-Llama-3-8B-Instruct": [ - 0.37936948540837095, - 0.032172769265151994 - ], - "meta-llama/Meta-Llama-3.1-70B-Instruct": [ - 0.603536768244583, - 0.027191895488989915 - ], - "mistralai/Mistral-7B-Instruct-v0.2": [ - 0.4071166722276529, - 0.04577594028555328 - ], - "mistralai/Mistral-7B-Instruct-v0.3": [ - 0.41195018984687265, - 0.056184679972755454 - ], - "openai/gpt-4-turbo-2024-04-09": [ - 0.6106943361444249, - 0.02932446842558468 - ], 
- "openai/gpt-4o-2024-05-13": [ - 0.5833874065757011, - 0.023695391445384514 - ] + "LlaMa 3.1 8B": [0.3533086666014079, 0.052422082615756406], + "Aya 23 35B": [0.43767196047824003, 0.026040919354464294], + # "Aya 23 8B": [0.013483014909052663, 0.03363706833599835], + "Command R": [0.374457668650282, 0.02926089754079793], + "Command R+": [0.3830841816733316, 0.020185255968455686], + "Gemma 1.1 7B": [0.5190375637539242, 0.027757722654111305], + "Gemma 2 9B": [0.5181663123111222, 0.031090119385244894], + "LlaMa 3 70B": [0.5685224105896568, 0.04853344616275034], + "LlaMa 3 8B": [0.37936948540837095, 0.032172769265151994], + "LlaMa 3.1 70B": [0.603536768244583, 0.027191895488989915], + "Mistral 7B v0.2": [0.4071166722276529, 0.04577594028555328], + "Mistral 7B v0.3": [0.41195018984687265, 0.056184679972755454], + "GPT-4 Turbo": [0.6106943361444249, 0.02932446842558468], + "GPT-4o": [0.5833874065757011, 0.023695391445384514], } sorted_data = dict(sorted(data.items(), key=lambda item: item[1][0])) @@ -66,27 +43,35 @@ means_sorted = [v[0] for v in sorted_data.values()] std_devs_sorted = [v[1] for v in sorted_data.values()] -sns.set(style="whitegrid") -palette = sns.color_palette("coolwarm", len(labels_sorted)) +# sns.set(style="whitegrid") +# palette = sns.color_palette("coolwarm", len(labels_sorted)) -plt.figure(figsize=(10, 6)) +plt.figure(figsize=(7, 7)) x_pos_sorted = np.arange(len(labels_sorted)) -ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, palette=palette, errorbar=None) -plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt='none', c='black', capsize=5) +ax1 = sns.barplot( + x=x_pos_sorted, + y=means_sorted, + errorbar=None, + color=COLORS.get("orange"), + edgecolor=COLORS.get("green"), +) +plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5) -ax1.spines['top'].set_color('black') -ax1.spines['right'].set_color('black') -ax1.spines['left'].set_color('black') -ax1.spines['bottom'].set_color('black') -for 
spine in ax1.spines.values(): - spine.set_linewidth(2) # Make the border thicker +# ax1.spines["top"].set_color("black") +# ax1.spines["right"].set_color("black") +# ax1.spines["left"].set_color("black") +# ax1.spines["bottom"].set_color("black") +# for spine in ax1.spines.values(): +# spine.set_linewidth(2) # Make the border thicker +plt.grid(color="gray", axis="y", alpha=0.2) plt.ylim(0, 0.8) +plt.gca().set_axisbelow(True) -plt.xticks(x_pos_sorted, labels_sorted, rotation=90) +plt.xticks(x_pos_sorted, labels_sorted, rotation=45, ha="right") plt.ylabel("Cohen's Kappa") -plt.title('Average Inner-Model Agreement Across Languages') +plt.title("Average Inner-Model Agreement Across Languages") plt.tight_layout() -plt.savefig(f"./innermodel_agreement.pdf", bbox_inches='tight') \ No newline at end of file +plt.savefig("plots/innermodel_agreement_green_oracle.pdf", bbox_inches="tight") diff --git a/analysis/plot_results.py b/analysis/plot_results.py index 2fbb64f..1ec28c4 100644 --- a/analysis/plot_results.py +++ b/analysis/plot_results.py @@ -13,7 +13,7 @@ PLOT_PARAMS = { "font.family": "serif", - "font.serif": ["Times New Roman", "STIX"], + "font.serif": ["Times", "Times New Roman", "STIX"], "font.size": FONT_SIZES.get("medium"), "axes.titlesize": FONT_SIZES.get("large"), "axes.labelsize": FONT_SIZES.get("large"), @@ -66,6 +66,8 @@ "zho": "zh", } +COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"} + def get_args(): # fmt: off @@ -122,6 +124,7 @@ def plot_main_heatmap( df = pd.read_csv(input_path) # Remove unnecessary column df.pop("eng_Latn") + df.pop("Family") df = df.sort_values(by="Avg_Multilingual", ascending=False).head(10).reset_index(drop=True) data = df[[col for col in df.columns if col not in ["Model_Type"]]].rename(columns={"Avg_Multilingual": "Avg"}) @@ -133,14 +136,39 @@ def plot_main_heatmap( data.pop("zho_Hant") data = data[sorted(data.columns)] data.columns = [col.split("_")[0] for col in data.columns] + data["Var"] = 
data[list(LANG_STANDARDIZATION.keys())].var(axis=1) data = data.rename(columns=LANG_STANDARDIZATION) - fig, ax = plt.subplots(1, 1, figsize=figsize) - sns.heatmap(data, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False) - ax.xaxis.set_ticks_position("top") - ax.tick_params(axis="x") - ax.set_ylabel("") - ax.set_yticklabels([f"{model} " for model in data.index]) + lang_results = data[list(LANG_STANDARDIZATION.values())] + avg = data[["Avg"]] + var = data[["Var"]] + + fig, axs = plt.subplots(ncols=3, figsize=figsize, gridspec_kw={"width_ratios": [0.5, 0.5, 9]}, sharey=True) + cmap = "Greys" + fmt = ".1f" + + sns.heatmap(avg, ax=axs[0], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False) + axs[0].xaxis.set_ticks_position("top") + axs[0].set_xticklabels(avg.columns, fontsize=20) + axs[0].tick_params(axis="x") + axs[0].set_ylabel("") + axs[0].set_yticklabels([f"{model} " for model in avg.index], fontsize=20) + + sns.heatmap(var, ax=axs[1], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False) + axs[1].xaxis.set_ticks_position("top") + axs[1].set_xticklabels(var.columns, fontsize=20) + axs[1].tick_params(axis="x") + axs[1].set_ylabel("") + axs[1].tick_params(axis="y", length=0) + axs[1].set_yticklabels([f"{model} " for model in var.index], fontsize=20) + + sns.heatmap(lang_results, ax=axs[2], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False) + axs[2].xaxis.set_ticks_position("top") + axs[2].set_xticklabels(lang_results.columns, fontsize=20) + axs[2].tick_params(axis="x") + axs[2].tick_params(axis="y", length=0) + axs[2].set_ylabel("") + axs[2].set_yticklabels([f"{model} " for model in lang_results.index], fontsize=20) plt.tight_layout() fig.savefig(output_path, bbox_inches="tight") @@ -155,7 +183,7 @@ def plot_eng_drop_line( from scipy.stats import pearsonr, spearmanr df = pd.read_csv(input_path) - df = df[["Model", "Model_Type", "eng_Latn", "Avg_Multilingual"]] + df = df[["Model", "Model_Type", 
"Family", "eng_Latn", "Avg_Multilingual"]] df = df.sort_values(by="Avg_Multilingual", ascending=False).reset_index(drop=True) data = df.set_index("Model").dropna() data[data.select_dtypes(include="number").columns] = data.select_dtypes(include="number") * 100 @@ -166,11 +194,19 @@ def plot_eng_drop_line( fig, ax = plt.subplots(figsize=figsize) - colors = ["red", "green", "blue"] + colors = [COLORS.get("green"), COLORS.get("purple"), COLORS.get("orange")] + markers = ["o", "*", "D"] for (label, group), color in zip(data.groupby("Model_Type"), colors): mrewardbench_scores = group["Avg_Multilingual"] rewardbench_scores = group["eng_Latn"] - ax.scatter(rewardbench_scores, mrewardbench_scores, marker="o", s=40, label=label, color=color) + ax.scatter( + rewardbench_scores, + mrewardbench_scores, + marker="o", + s=60, + label=label, + color=color, + ) mrewardbench_scores = data["Avg_Multilingual"] rewardbench_scores = data["eng_Latn"] @@ -188,22 +224,23 @@ def plot_eng_drop_line( ax.set_aspect("equal") ax.legend(frameon=False, handletextpad=0.2, fontsize=12) - model_names = [MODEL_STANDARDIZATION[model] for model in data.index] - texts = [ - ax.text( - rewardbench_scores[idx], - mrewardbench_scores[idx], - model_names[idx], - fontsize=14, + if top_n: + model_names = [MODEL_STANDARDIZATION[model] for model in data.index] + texts = [ + ax.text( + rewardbench_scores[idx], + mrewardbench_scores[idx], + model_names[idx], + fontsize=14, + ) + for idx in range(len(data)) + ] + adjust_text( + texts, + ax=ax, + # force_static=0.15, + arrowprops=dict(arrowstyle="->", color="gray"), ) - for idx in range(len(data)) - ] - adjust_text( - texts, - ax=ax, - # force_static=0.15, - arrowprops=dict(arrowstyle="->", color="gray"), - ) # ax.text( # 0.6, @@ -270,7 +307,8 @@ def plot_ling_dims( y=dim, data=lingdf, ax=ax, - color="green", + color=COLORS.get("orange"), + edgecolor=COLORS.get("green"), width=0.4 if dim == "Resource Availability" else 0.7, ) ax.set_title(dim)