Skip to content

Commit

Permalink
Update all charts (#47)
Browse files Browse the repository at this point in the history
  • Loading branch information
ljvmiranda921 authored Oct 13, 2024
1 parent a8222bb commit 8b8fa99
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 96 deletions.
125 changes: 55 additions & 70 deletions analysis/avg_agreement_final.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,91 +2,76 @@
import matplotlib.pyplot as plt
import numpy as np

FONT_SIZES = {"small": 12, "medium": 16, "large": 18}
COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"}

PLOT_PARAMS = {
"font.family": "serif",
"font.serif": ["Times New Roman", "STIX"],
"font.size": FONT_SIZES.get("medium"),
"axes.titlesize": FONT_SIZES.get("large"),
"axes.labelsize": FONT_SIZES.get("large"),
"xtick.labelsize": FONT_SIZES.get("large"),
"ytick.labelsize": FONT_SIZES.get("large"),
"legend.fontsize": FONT_SIZES.get("medium"),
"figure.titlesize": FONT_SIZES.get("medium"),
"text.usetex": False,
}

plt.rcParams.update(PLOT_PARAMS)


data = {
"meta-llama/Meta-Llama-3.1-8B-Instruct": [
0.3533086666014079,
0.052422082615756406
],
"cohere/c4ai-aya-23-35b": [
0.43767196047824003,
0.026040919354464294
],
"cohere/c4ai-aya-23-8b": [
0.013483014909052663,
0.03363706833599835
],
"cohere/command-r-08-2024": [
0.374457668650282,
0.02926089754079793
],
"cohere/command-r-plus-08-2024": [
0.3830841816733316,
0.020185255968455686
],
"google/gemma-1.1-7b-it": [
0.5190375637539242,
0.027757722654111305
],
"google/gemma-2-9b-it": [
0.5181663123111222,
0.031090119385244894
],
"meta-llama/Meta-Llama-3-70B-Instruct": [
0.5685224105896568,
0.04853344616275034
],
"meta-llama/Meta-Llama-3-8B-Instruct": [
0.37936948540837095,
0.032172769265151994
],
"meta-llama/Meta-Llama-3.1-70B-Instruct": [
0.603536768244583,
0.027191895488989915
],
"mistralai/Mistral-7B-Instruct-v0.2": [
0.4071166722276529,
0.04577594028555328
],
"mistralai/Mistral-7B-Instruct-v0.3": [
0.41195018984687265,
0.056184679972755454
],
"openai/gpt-4-turbo-2024-04-09": [
0.6106943361444249,
0.02932446842558468
],
"openai/gpt-4o-2024-05-13": [
0.5833874065757011,
0.023695391445384514
]
"LlaMa 3.1 8B": [0.3533086666014079, 0.052422082615756406],
"Aya 23 35B": [0.43767196047824003, 0.026040919354464294],
# "Aya 23 8B": [0.013483014909052663, 0.03363706833599835],
"Command R": [0.374457668650282, 0.02926089754079793],
"Command R+": [0.3830841816733316, 0.020185255968455686],
"Gemma 1.1 7B": [0.5190375637539242, 0.027757722654111305],
"Gemma 2 9B": [0.5181663123111222, 0.031090119385244894],
"LlaMa 3 70B": [0.5685224105896568, 0.04853344616275034],
"LlaMa 3 8B": [0.37936948540837095, 0.032172769265151994],
"LlaMa 3.1 70B": [0.603536768244583, 0.027191895488989915],
"Mistral 7B v0.2": [0.4071166722276529, 0.04577594028555328],
"Mistral 7B v0.3": [0.41195018984687265, 0.056184679972755454],
"GPT-4 Turbo": [0.6106943361444249, 0.02932446842558468],
"GPT-4o": [0.5833874065757011, 0.023695391445384514],
}

sorted_data = dict(sorted(data.items(), key=lambda item: item[1][0]))
labels_sorted = list(sorted_data.keys())
means_sorted = [v[0] for v in sorted_data.values()]
std_devs_sorted = [v[1] for v in sorted_data.values()]

sns.set(style="whitegrid")
palette = sns.color_palette("coolwarm", len(labels_sorted))
# sns.set(style="whitegrid")
# palette = sns.color_palette("coolwarm", len(labels_sorted))

plt.figure(figsize=(10, 6))
plt.figure(figsize=(7, 7))
x_pos_sorted = np.arange(len(labels_sorted))

ax1 = sns.barplot(x=x_pos_sorted, y=means_sorted, palette=palette, errorbar=None)
plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt='none', c='black', capsize=5)
ax1 = sns.barplot(
x=x_pos_sorted,
y=means_sorted,
errorbar=None,
color=COLORS.get("orange"),
edgecolor=COLORS.get("green"),
)
plt.errorbar(x_pos_sorted, means_sorted, yerr=std_devs_sorted, fmt="none", c="black", capsize=5)

ax1.spines['top'].set_color('black')
ax1.spines['right'].set_color('black')
ax1.spines['left'].set_color('black')
ax1.spines['bottom'].set_color('black')
for spine in ax1.spines.values():
spine.set_linewidth(2) # Make the border thicker
# ax1.spines["top"].set_color("black")
# ax1.spines["right"].set_color("black")
# ax1.spines["left"].set_color("black")
# ax1.spines["bottom"].set_color("black")
# for spine in ax1.spines.values():
# spine.set_linewidth(2) # Make the border thicker
plt.grid(color="gray", axis="y", alpha=0.2)

plt.ylim(0, 0.8)
plt.gca().set_axisbelow(True)

plt.xticks(x_pos_sorted, labels_sorted, rotation=90)
plt.xticks(x_pos_sorted, labels_sorted, rotation=45, ha="right")
plt.ylabel("Cohen's Kappa")
plt.title('Average Inner-Model Agreement Across Languages')
plt.title("Average Inner-Model Agreement Across Languages")

plt.tight_layout()
plt.savefig(f"./innermodel_agreement.pdf", bbox_inches='tight')
plt.savefig("plots/innermodel_agreement_green_oracle.pdf", bbox_inches="tight")
90 changes: 64 additions & 26 deletions analysis/plot_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

PLOT_PARAMS = {
"font.family": "serif",
"font.serif": ["Times New Roman", "STIX"],
"font.serif": ["Times", "Times New Roman", "STIX"],
"font.size": FONT_SIZES.get("medium"),
"axes.titlesize": FONT_SIZES.get("large"),
"axes.labelsize": FONT_SIZES.get("large"),
Expand Down Expand Up @@ -66,6 +66,8 @@
"zho": "zh",
}

COLORS = {"green": "#355145", "purple": "#d8a6e5", "orange": "#fe7759"}


def get_args():
# fmt: off
Expand Down Expand Up @@ -122,6 +124,7 @@ def plot_main_heatmap(
df = pd.read_csv(input_path)
# Remove unnecessary column
df.pop("eng_Latn")
df.pop("Family")

df = df.sort_values(by="Avg_Multilingual", ascending=False).head(10).reset_index(drop=True)
data = df[[col for col in df.columns if col not in ["Model_Type"]]].rename(columns={"Avg_Multilingual": "Avg"})
Expand All @@ -133,14 +136,39 @@ def plot_main_heatmap(
data.pop("zho_Hant")
data = data[sorted(data.columns)]
data.columns = [col.split("_")[0] for col in data.columns]
data["Var"] = data[list(LANG_STANDARDIZATION.keys())].var(axis=1)
data = data.rename(columns=LANG_STANDARDIZATION)

fig, ax = plt.subplots(1, 1, figsize=figsize)
sns.heatmap(data, ax=ax, cmap="YlGn", annot=True, annot_kws={"size": 16}, fmt=".2f", cbar=False)
ax.xaxis.set_ticks_position("top")
ax.tick_params(axis="x")
ax.set_ylabel("")
ax.set_yticklabels([f"{model} " for model in data.index])
lang_results = data[list(LANG_STANDARDIZATION.values())]
avg = data[["Avg"]]
var = data[["Var"]]

fig, axs = plt.subplots(ncols=3, figsize=figsize, gridspec_kw={"width_ratios": [0.5, 0.5, 9]}, sharey=True)
cmap = "Greys"
fmt = ".1f"

sns.heatmap(avg, ax=axs[0], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
axs[0].xaxis.set_ticks_position("top")
axs[0].set_xticklabels(avg.columns, fontsize=20)
axs[0].tick_params(axis="x")
axs[0].set_ylabel("")
axs[0].set_yticklabels([f"{model} " for model in avg.index], fontsize=20)

sns.heatmap(var, ax=axs[1], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
axs[1].xaxis.set_ticks_position("top")
axs[1].set_xticklabels(var.columns, fontsize=20)
axs[1].tick_params(axis="x")
axs[1].set_ylabel("")
axs[1].tick_params(axis="y", length=0)
axs[1].set_yticklabels([f"{model} " for model in var.index], fontsize=20)

sns.heatmap(lang_results, ax=axs[2], cmap=cmap, annot=True, annot_kws={"size": 16}, fmt=fmt, cbar=False)
axs[2].xaxis.set_ticks_position("top")
axs[2].set_xticklabels(lang_results.columns, fontsize=20)
axs[2].tick_params(axis="x")
axs[2].tick_params(axis="y", length=0)
axs[2].set_ylabel("")
axs[2].set_yticklabels([f"{model} " for model in lang_results.index], fontsize=20)

plt.tight_layout()
fig.savefig(output_path, bbox_inches="tight")
Expand All @@ -155,7 +183,7 @@ def plot_eng_drop_line(
from scipy.stats import pearsonr, spearmanr

df = pd.read_csv(input_path)
df = df[["Model", "Model_Type", "eng_Latn", "Avg_Multilingual"]]
df = df[["Model", "Model_Type", "Family", "eng_Latn", "Avg_Multilingual"]]
df = df.sort_values(by="Avg_Multilingual", ascending=False).reset_index(drop=True)
data = df.set_index("Model").dropna()
data[data.select_dtypes(include="number").columns] = data.select_dtypes(include="number") * 100
Expand All @@ -166,11 +194,19 @@ def plot_eng_drop_line(

fig, ax = plt.subplots(figsize=figsize)

colors = ["red", "green", "blue"]
colors = [COLORS.get("green"), COLORS.get("purple"), COLORS.get("orange")]
markers = ["o", "*", "D"]
for (label, group), color in zip(data.groupby("Model_Type"), colors):
mrewardbench_scores = group["Avg_Multilingual"]
rewardbench_scores = group["eng_Latn"]
ax.scatter(rewardbench_scores, mrewardbench_scores, marker="o", s=40, label=label, color=color)
ax.scatter(
rewardbench_scores,
mrewardbench_scores,
marker="o",
s=60,
label=label,
color=color,
)

mrewardbench_scores = data["Avg_Multilingual"]
rewardbench_scores = data["eng_Latn"]
Expand All @@ -188,22 +224,23 @@ def plot_eng_drop_line(
ax.set_aspect("equal")
ax.legend(frameon=False, handletextpad=0.2, fontsize=12)

model_names = [MODEL_STANDARDIZATION[model] for model in data.index]
texts = [
ax.text(
rewardbench_scores[idx],
mrewardbench_scores[idx],
model_names[idx],
fontsize=14,
if top_n:
model_names = [MODEL_STANDARDIZATION[model] for model in data.index]
texts = [
ax.text(
rewardbench_scores[idx],
mrewardbench_scores[idx],
model_names[idx],
fontsize=14,
)
for idx in range(len(data))
]
adjust_text(
texts,
ax=ax,
# force_static=0.15,
arrowprops=dict(arrowstyle="->", color="gray"),
)
for idx in range(len(data))
]
adjust_text(
texts,
ax=ax,
# force_static=0.15,
arrowprops=dict(arrowstyle="->", color="gray"),
)

# ax.text(
# 0.6,
Expand Down Expand Up @@ -270,7 +307,8 @@ def plot_ling_dims(
y=dim,
data=lingdf,
ax=ax,
color="green",
color=COLORS.get("orange"),
edgecolor=COLORS.get("green"),
width=0.4 if dim == "Resource Availability" else 0.7,
)
ax.set_title(dim)
Expand Down

0 comments on commit 8b8fa99

Please sign in to comment.