-
Notifications
You must be signed in to change notification settings - Fork 435
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into lhw_bug_fix
- Loading branch information
Showing
794 changed files
with
50,009 additions
and
3,676 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import csv | ||
import os | ||
|
||
import pytest | ||
import yaml | ||
|
||
# Directory that is searched (recursively) for the summary CSV of a run.
output_path = 'regression_result_daily'

# Models and datasets whose scores are checked; the test is parametrized
# over every (model, dataset) combination of these two lists.
model_list = [
    'internlm-7b-hf',
    'internlm-chat-7b-hf',
    'chatglm3-6b-base-hf',
]
dataset_list = [
    'ARC-c',
    'chid-dev',
    'chid-test',
    'openai_humaneval',
    'openbookqa',
    'openbookqa_fact',
]
|
||
|
||
@pytest.fixture()
def baseline_scores(request):
    """Return the parsed content of the baseline score YAML file.

    The file is looked up at ``.github/scripts/oc_score_baseline.yaml``
    relative to the pytest root directory and parsed with the safe loader.
    """
    baseline_file = os.path.join(request.config.rootdir,
                                 '.github/scripts/oc_score_baseline.yaml')
    with open(baseline_file) as stream:
        return yaml.safe_load(stream)
|
||
|
||
@pytest.fixture()
def result_scores():
    """Return the parsed result CSV found under ``output_path``.

    Yields ``None`` as the fixture value when no CSV file exists there.
    """
    csv_path = find_csv_files(output_path)
    return None if csv_path is None else read_csv_file(csv_path)
|
||
|
||
@pytest.mark.usefixtures('result_scores')
@pytest.mark.usefixtures('baseline_scores')
class TestChat:
    """Compare evaluation scores against the stored baselines.

    One test case is generated for every (model, dataset) combination of
    ``model_list`` and ``dataset_list``.
    """

    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
                                                for p2 in dataset_list])
    def test_model_dataset_score(self, baseline_scores, result_scores, model,
                                 dataset):
        """Check that the score of ``model`` on ``dataset`` matches baseline.

        Missing inputs are reported explicitly instead of surfacing as an
        ``AttributeError`` on ``None``.
        """
        # The result fixture returns None when no summary CSV was produced.
        assert result_scores is not None, (
            f'no result csv found under {output_path!r}')
        model_results = result_scores.get(model)
        assert model_results is not None, (
            f'model {model!r} is missing from the result csv')

        base_score = ((baseline_scores or {}).get(model) or {}).get(dataset)
        assert base_score is not None, (
            f'no baseline score for {model!r} on {dataset!r}')

        assert_score(model_results.get(dataset), base_score)
|
||
|
||
def assert_score(score, baseline):
    """Fail unless ``score`` lies strictly within +/-3% of ``baseline``.

    Args:
        score: Measured value, either a number or a numeric string (the CSV
            reader yields strings). ``None`` and ``'-'`` mean "no result".
        baseline: Reference value the score is compared against.

    Raises:
        AssertionError: If the score or the baseline is missing, or the
            score is outside the open interval
            ``(baseline * 0.97, baseline * 1.03)``.
        ValueError: If ``score`` is a string that is not a valid float.
    """
    # Raise explicitly rather than ``assert False`` so the check is not
    # stripped when Python runs with -O.
    if score is None or score == '-':
        raise AssertionError('value is none')
    if baseline is None:
        raise AssertionError('baseline is none')

    lower, upper = baseline * 0.97, baseline * 1.03
    # f-strings keep the messages working for numeric scores too; plain
    # ``score + '...'`` only works when score is already a string.
    if lower < float(score) < upper:
        print(f'{score} between {lower} and {upper}')
    else:
        raise AssertionError(f'{score} not between {lower} and {upper}')
|
||
|
||
def find_csv_files(directory):
    """Locate the single ``.csv`` file below ``directory``.

    Args:
        directory: Root directory that is walked recursively.

    Returns:
        The path of the only CSV file found, or ``None`` when there is none
        (including when ``directory`` does not exist).

    Raises:
        RuntimeError: If more than one CSV file is found.
    """
    csv_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.csv')
    ]
    if len(csv_files) > 1:
        # Raising a bare string is itself a TypeError in Python 3, which hid
        # this message; raise a real exception and list the candidates.
        raise RuntimeError(
            'have more than 1 result file, please check the result manually: '
            + ', '.join(sorted(csv_files)))
    return csv_files[0] if csv_files else None
|
||
|
||
def read_csv_file(file_path):
    """Read a result CSV into a nested ``{column: {dataset: value}}`` dict.

    The ``version``, ``metric`` and ``mode`` columns are dropped and the
    ``dataset`` column supplies the inner key. Every other column becomes an
    outer key (the tests look these up by model name). Values are kept as
    the strings read from the file.

    Args:
        file_path: Path of the CSV file; its first row is the header.

    Returns:
        dict mapping column name to a dict of dataset name to cell value.
    """
    skipped_columns = ('version', 'metric', 'mode', 'dataset')
    result = {}
    # newline='' lets the csv module do its own line-ending handling, as
    # its documentation requires for file objects.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            dataset_name = row.get('dataset')
            for column, value in row.items():
                if column in skipped_columns:
                    continue
                result.setdefault(column, {})[dataset_name] = value
    return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Baseline scores, keyed by model name and then by dataset name.
# NOTE(review): presumably this is the .github/scripts/oc_score_baseline.yaml
# file loaded by the baseline_scores pytest fixture - confirm.
internlm-7b-hf:
  ARC-c: 36.27
  chid-dev: 81.68
  chid-test: 83.67
  openai_humaneval: 10.37
  openbookqa: 44.4
  openbookqa_fact: 73.2

internlm-chat-7b-hf:
  ARC-c: 36.95
  chid-dev: 71.78
  chid-test: 76.87
  openai_humaneval: 21.34
  openbookqa: 66.6
  openbookqa_fact: 80.4

chatglm3-6b-base-hf:
  ARC-c: 43.05
  chid-dev: 80.2
  chid-test: 80.77
  openai_humaneval: 20.73
  openbookqa: 79.8
  openbookqa_fact: 92.2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import csv | ||
import os | ||
|
||
import pytest | ||
|
||
# Directory that is searched (recursively) for the summary CSV of a run.
output_path = 'regression_result'
# Model column and dataset row whose score the test checks.
model = 'internlm-chat-7b-hf'
dataset = 'siqa'
|
||
|
||
@pytest.fixture()
def result_scores():
    """Return the parsed result CSV found under ``output_path``.

    Yields ``None`` as the fixture value when no CSV file exists there.
    """
    csv_path = find_csv_files(output_path)
    return None if csv_path is None else read_csv_file(csv_path)
|
||
|
||
@pytest.mark.usefixtures('result_scores')
class TestChatScore:
    """Check the score of the configured model on the configured dataset."""

    def test_model_dataset_score(self, result_scores):
        """Compare the measured score with the fixed baseline of 73.59.

        Missing inputs are reported explicitly instead of surfacing as an
        ``AttributeError`` on ``None``.
        """
        # The result fixture returns None when no summary CSV was produced.
        assert result_scores is not None, (
            f'no result csv found under {output_path!r}')
        model_results = result_scores.get(model)
        assert model_results is not None, (
            f'model {model!r} is missing from the result csv')

        assert_score(model_results.get(dataset), 73.59)
|
||
|
||
def assert_score(score, baseline):
    """Fail unless ``score`` lies strictly within +/-3% of ``baseline``.

    Args:
        score: Measured value, either a number or a numeric string (the CSV
            reader yields strings). ``None`` and ``'-'`` mean "no result".
        baseline: Reference value the score is compared against.

    Raises:
        AssertionError: If the score or the baseline is missing, or the
            score is outside the open interval
            ``(baseline * 0.97, baseline * 1.03)``.
        ValueError: If ``score`` is a string that is not a valid float.
    """
    # Raise explicitly rather than ``assert False`` so the check is not
    # stripped when Python runs with -O.
    if score is None or score == '-':
        raise AssertionError('value is none')
    if baseline is None:
        raise AssertionError('baseline is none')

    lower, upper = baseline * 0.97, baseline * 1.03
    # f-strings keep the messages working for numeric scores too; plain
    # ``score + '...'`` only works when score is already a string.
    if lower < float(score) < upper:
        print(f'{score} between {lower} and {upper}')
    else:
        raise AssertionError(f'{score} not between {lower} and {upper}')
|
||
|
||
def find_csv_files(directory):
    """Locate the single ``.csv`` file below ``directory``.

    Args:
        directory: Root directory that is walked recursively.

    Returns:
        The path of the only CSV file found, or ``None`` when there is none
        (including when ``directory`` does not exist).

    Raises:
        RuntimeError: If more than one CSV file is found.
    """
    csv_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.csv')
    ]
    if len(csv_files) > 1:
        # Raising a bare string is itself a TypeError in Python 3, which hid
        # this message; raise a real exception and list the candidates.
        raise RuntimeError(
            'have more than 1 result file, please check the result manually: '
            + ', '.join(sorted(csv_files)))
    return csv_files[0] if csv_files else None
|
||
|
||
def read_csv_file(file_path):
    """Read a result CSV into a nested ``{column: {dataset: value}}`` dict.

    The ``version``, ``metric`` and ``mode`` columns are dropped and the
    ``dataset`` column supplies the inner key. Every other column becomes an
    outer key (the test looks these up by model name). Values are kept as
    the strings read from the file.

    Args:
        file_path: Path of the CSV file; its first row is the header.

    Returns:
        dict mapping column name to a dict of dataset name to cell value.
    """
    skipped_columns = ('version', 'metric', 'mode', 'dataset')
    result = {}
    # newline='' lets the csv module do its own line-ending handling, as
    # its documentation requires for file objects.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # Named dataset_name so it is not confused with the
            # module-level ``dataset`` constant.
            dataset_name = row.get('dataset')
            for column, value in row.items():
                if column in skipped_columns:
                    continue
                result.setdefault(column, {})[dataset_name] = value
    return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Daily regression run: evaluates a fixed set of models/datasets on the
# self-hosted runner and checks the scores with the pytest script.
name: daily_run_test

on:
  workflow_dispatch:
  schedule:
    - cron: '56 16 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  CONDA_ENV: opencompass_regression
  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub

jobs:
  daily_run_test:
    runs-on: self-hosted
    environment: 'prod'
    timeout-minutes: 240  # 4 hours
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Prepare - create conda env and install torch
        run: |
          eval "$(conda shell.bash hook)"
          conda create -y --name ${{env.CONDA_ENV}} python=3.10
          conda activate ${{env.CONDA_ENV}}
          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
          conda info --envs
      - name: Prepare - Pip install code
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
          pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
          conda info --envs
      - name: Prepare - prepare data and hf model
        run: |
          cp -r ${{env.USERSPACE_PREFIX}}/data .
          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
          # A plain `export` only lives in this step's shell; write the
          # offline flags to GITHUB_ENV so the following steps see them.
          echo "HF_DATASETS_OFFLINE=1" >> "$GITHUB_ENV"
          echo "TRANSFORMERS_OFFLINE=1" >> "$GITHUB_ENV"
      - name: Run test
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          rm -rf regression_result_daily
          export from_tf=TRUE
          python3 run.py --models hf_internlm_chat_7b hf_internlm_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily
      - name: Get result
        run: |
          eval "$(conda shell.bash hook)"
          # Activate the job's env so pytest is installed into, and run
          # from, the same environment as the code under test.
          conda activate ${{env.CONDA_ENV}}
          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
          python -m pytest -s -v --color=yes .github/scripts/oc_score_assert.py
      - name: Remove Conda Env
        if: always()
        run: |
          eval "$(conda shell.bash hook)"
          conda env remove --name ${{env.CONDA_ENV}}
          conda info --envs

  # Posts a Feishu message when the test job failed on develop or main.
  notify_to_feishu:
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [daily_run_test]
    environment: 'prod'
    timeout-minutes: 5
    runs-on: self-hosted
    steps:
      - name: notify
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- Daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
name: 'Link check'

on:
  schedule:
    # check links at 01:30 a.m. every day
    - cron: '30 1 * * *'

jobs:
  link-check:
    runs-on: ubuntu-latest
    steps:
      # - uses: actions/checkout@v3

      - name: linkchecker
        run: |
          pip install linkchecker
          # The --ignore-url values are regular expressions. They are
          # single-quoted so the shell neither strips the backslashes nor
          # interprets the parentheses and pipe.
          linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30 --no-warnings \
            --ignore-url 'https://opencompass\.readthedocs\.io/.*/static/images/opencompass_logo\.svg' \
            --ignore-url 'https://opencompass\.readthedocs\.io/.*/_static/images/icon-menu-dots\.svg' \
            --ignore-url 'https://opencompass\.readthedocs\.io/policy' \
            --ignore-url 'https://opencompass\.readthedocs\.io/(en|zh_CN)/[0-9a-f]{40}/.*'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# Smoke test for pull requests (also runnable manually and on a schedule):
# evaluates one model on one dataset and checks the resulting score.
name: pr_run_test

on:
  pull_request:
    paths-ignore:
      - 'README.md'
      - 'README_zh-CN.md'
      - 'docs/**'
      - 'configs/**'
      - 'tools/**'

  workflow_dispatch:
  schedule:
    - cron: '56 22 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  CONDA_ENV: opencompass_base
  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub

jobs:
  pr_run_test:
    runs-on: self-hosted
    environment: 'prod'
    timeout-minutes: 30
    steps:
      - name: Clone repository
        uses: actions/checkout@v2
      - name: Prepare - Install opencompass
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          python3 -m pip uninstall opencompass -y
          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
          conda info --envs
      - name: Prepare - prepare data and hf model
        run: |
          cp -r ${{env.USERSPACE_PREFIX}}/data .
          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
          # A plain `export` only lives in this step's shell; write the
          # offline flags to GITHUB_ENV so the following steps see them.
          echo "HF_DATASETS_OFFLINE=1" >> "$GITHUB_ENV"
          echo "TRANSFORMERS_OFFLINE=1" >> "$GITHUB_ENV"
      - name: Run test
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          conda info --envs
          rm -rf regression_result
          python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
      - name: Get result
        run: |
          # Take the last column of the last row of the summary CSV and
          # compare its integer part against the accepted range.
          score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
          if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
             echo "score is $score between 70 and 75"
          else
             echo "score is $score not between 70 and 75"
             exit 1
          fi
          rm -rf regression_result
      - name: Uninstall opencompass
        if: always()
        run: |
          eval "$(conda shell.bash hook)"
          conda activate ${{env.CONDA_ENV}}
          python3 -m pip uninstall opencompass -y
          conda info --envs

  # Posts a Feishu message when the test job failed on develop or main.
  notify_to_feishu:
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [pr_run_test]
    environment: 'prod'
    timeout-minutes: 5
    runs-on: self-hosted
    steps:
      - name: notify
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}
Oops, something went wrong.