Merge branch 'main' into lhw_bug_fix

open-compass · Apr 24, 2024 · 24f6ab0 · 24f6ab0
2 parents 52f38a8 + 81d0e4d
commit 24f6ab0
Show file tree

Hide file tree

Showing 794 changed files with 50,009 additions and 3,676 deletions.
diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py
@@ -0,0 +1,96 @@
+import csv
+import os
+
+import pytest
+import yaml
+
+# Directory that is searched recursively for the result CSV of the run.
+output_path = 'regression_result_daily'
+
+# Every (model, dataset) combination of the two lists below is turned into
+# one parametrized test case.
+model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf']
+dataset_list = [
+    'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa',
+    'openbookqa_fact'
+]
+
+
+@pytest.fixture()
+def baseline_scores(request):
+    """Load the expected scores from ``oc_score_baseline.yaml``.
+
+    The file is resolved relative to the pytest root directory and parsed
+    with the safe YAML loader; the parsed document is returned as is.
+    """
+    baseline_file = os.path.join(request.config.rootdir,
+                                 '.github/scripts/oc_score_baseline.yaml')
+    with open(baseline_file) as stream:
+        return yaml.safe_load(stream)
+
+
+@pytest.fixture()
+def result_scores():
+    """Parse the single result CSV found below ``output_path``.
+
+    Returns ``None`` when no CSV file is found.
+    """
+    csv_path = find_csv_files(output_path)
+    return None if csv_path is None else read_csv_file(csv_path)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+class TestChat:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
+                                                for p2 in dataset_list])
+    def test_model_dataset_score(self, baseline_scores, result_scores, model,
+                                 dataset):
+        """Compare the score of one model/dataset pair with its baseline.
+
+        Missing inputs (no result CSV, unknown model, no baseline entry) are
+        reported as readable assertion failures instead of surfacing as an
+        ``AttributeError``/``TypeError`` on ``None``.
+        """
+        model_baseline = baseline_scores.get(model)
+        assert model_baseline is not None, f'no baseline for model {model}'
+        base_score = model_baseline.get(dataset)
+        assert base_score is not None, (
+            f'no baseline for {model} on {dataset}')
+
+        # The result_scores fixture yields None when no CSV file was found.
+        assert result_scores is not None, (
+            f'no result csv found in {output_path}')
+        model_result = result_scores.get(model)
+        assert model_result is not None, f'no result for model {model}'
+
+        # A missing dataset entry is passed on as None and rejected by
+        # assert_score itself.
+        assert_score(model_result.get(dataset), base_score)
+
+
+def assert_score(score, baseline):
+    """Assert that ``score`` lies strictly within 3% of ``baseline``.
+
+    Args:
+        score: Measured score, either a number or its string form as read
+            from the result CSV. ``None`` and ``'-'`` count as missing.
+        baseline: Expected score as a number.
+
+    Raises:
+        AssertionError: If the score is missing or lies outside the open
+            interval ``(baseline * 0.97, baseline * 1.03)``.
+    """
+    # Raise explicitly: a bare ``assert False`` is stripped under ``-O``.
+    if score is None or score == '-':
+        raise AssertionError('value is none')
+
+    lower, upper = baseline * 0.97, baseline * 1.03
+    # f-strings format both string and numeric scores; concatenating with
+    # ``+`` would raise a TypeError for a numeric score.
+    if not lower < float(score) < upper:
+        raise AssertionError(f'{score} not between {lower} and {upper}')
+    print(f'{score} between {lower} and {upper}')
+
+
+def find_csv_files(directory):
+    """Return the path of the only ``.csv`` file below ``directory``.
+
+    Args:
+        directory: Root directory that is walked recursively.
+
+    Returns:
+        The path of the single CSV file, or ``None`` if there is none.
+
+    Raises:
+        RuntimeError: If more than one CSV file is found.
+    """
+    csv_files = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.csv'):
+                csv_files.append(os.path.join(root, file))
+    if len(csv_files) > 1:
+        # Raising a plain string is itself a TypeError in Python 3, which
+        # would hide this message; raise a real exception instead.
+        raise RuntimeError(
+            'have more than 1 result file, please check the result manually')
+    return csv_files[0] if csv_files else None
+
+
+def read_csv_file(file_path):
+    """Read a result CSV into a nested ``{column: {dataset: value}}`` dict.
+
+    The ``version``, ``metric`` and ``mode`` columns are dropped and the
+    ``dataset`` column supplies the inner key. Every remaining column
+    becomes an outer key. Values are kept as the strings found in the file.
+    """
+    skipped_columns = ('version', 'metric', 'mode', 'dataset')
+    scores_by_column = {}
+    with open(file_path, 'r') as handle:
+        for record in csv.DictReader(handle):
+            dataset_name = record.get('dataset')
+            for column, value in record.items():
+                if column in skipped_columns:
+                    continue
+                scores_by_column.setdefault(column, {})[dataset_name] = value
+    return scores_by_column
diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml
@@ -0,0 +1,23 @@
+# Baseline scores, keyed by model name and then by dataset name.
+internlm-7b-hf:
+    ARC-c: 36.27
+    chid-dev: 81.68
+    chid-test: 83.67
+    openai_humaneval: 10.37
+    openbookqa: 44.4
+    openbookqa_fact: 73.2
+
+internlm-chat-7b-hf:
+    ARC-c: 36.95
+    chid-dev: 71.78
+    chid-test: 76.87
+    openai_humaneval: 21.34
+    openbookqa: 66.6
+    openbookqa_fact: 80.4
+
+chatglm3-6b-base-hf:
+    ARC-c: 43.05
+    chid-dev: 80.2
+    chid-test: 80.77
+    openai_humaneval: 20.73
+    openbookqa: 79.8
+    openbookqa_fact: 92.2
diff --git a/.github/scripts/pr_oc_score_assert.py b/.github/scripts/pr_oc_score_assert.py
@@ -0,0 +1,77 @@
+import csv
+import os
+
+import pytest
+
+# Directory that is searched recursively for the result CSV of the run.
+output_path = 'regression_result'
+# Column (model) and row (dataset) of the single score that is checked.
+model = 'internlm-chat-7b-hf'
+dataset = 'siqa'
+
+
+@pytest.fixture()
+def result_scores():
+    """Parse the single result CSV found below ``output_path``.
+
+    Returns ``None`` when no CSV file is found.
+    """
+    csv_path = find_csv_files(output_path)
+    return None if csv_path is None else read_csv_file(csv_path)
+
+
+@pytest.mark.usefixtures('result_scores')
+class TestChatScore:
+    """Test cases for chat model."""
+
+    def test_model_dataset_score(self, result_scores):
+        """Check the ``model``/``dataset`` score against the 73.59 baseline.
+
+        A missing result CSV or model column is reported as a readable
+        assertion failure instead of an ``AttributeError`` on ``None``.
+        """
+        # The result_scores fixture yields None when no CSV file was found.
+        assert result_scores is not None, (
+            f'no result csv found in {output_path}')
+        model_result = result_scores.get(model)
+        assert model_result is not None, f'no result for model {model}'
+
+        # A missing dataset entry is passed on as None and rejected by
+        # assert_score itself.
+        assert_score(model_result.get(dataset), 73.59)
+
+
+def assert_score(score, baseline):
+    """Assert that ``score`` lies strictly within 3% of ``baseline``.
+
+    Args:
+        score: Measured score, either a number or its string form as read
+            from the result CSV. ``None`` and ``'-'`` count as missing.
+        baseline: Expected score as a number.
+
+    Raises:
+        AssertionError: If the score is missing or lies outside the open
+            interval ``(baseline * 0.97, baseline * 1.03)``.
+    """
+    # Raise explicitly: a bare ``assert False`` is stripped under ``-O``.
+    if score is None or score == '-':
+        raise AssertionError('value is none')
+
+    lower, upper = baseline * 0.97, baseline * 1.03
+    # f-strings format both string and numeric scores; concatenating with
+    # ``+`` would raise a TypeError for a numeric score.
+    if not lower < float(score) < upper:
+        raise AssertionError(f'{score} not between {lower} and {upper}')
+    print(f'{score} between {lower} and {upper}')
+
+
+def find_csv_files(directory):
+    """Return the path of the only ``.csv`` file below ``directory``.
+
+    Args:
+        directory: Root directory that is walked recursively.
+
+    Returns:
+        The path of the single CSV file, or ``None`` if there is none.
+
+    Raises:
+        RuntimeError: If more than one CSV file is found.
+    """
+    csv_files = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            if file.endswith('.csv'):
+                csv_files.append(os.path.join(root, file))
+    if len(csv_files) > 1:
+        # Raising a plain string is itself a TypeError in Python 3, which
+        # would hide this message; raise a real exception instead.
+        raise RuntimeError(
+            'have more than 1 result file, please check the result manually')
+    return csv_files[0] if csv_files else None
+
+
+def read_csv_file(file_path):
+    """Read a result CSV into a nested ``{column: {dataset: value}}`` dict.
+
+    The ``version``, ``metric`` and ``mode`` columns are dropped and the
+    ``dataset`` column supplies the inner key. Every remaining column
+    becomes an outer key. Values are kept as the strings found in the file.
+    """
+    skipped_columns = ('version', 'metric', 'mode', 'dataset')
+    scores_by_column = {}
+    with open(file_path, 'r') as handle:
+        for record in csv.DictReader(handle):
+            dataset_name = record.get('dataset')
+            for column, value in record.items():
+                if column in skipped_columns:
+                    continue
+                scores_by_column.setdefault(column, {})[dataset_name] = value
+    return scores_by_column
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
@@ -0,0 +1,75 @@
+# Daily regression workflow; it can also be started manually through
+# workflow_dispatch.
+name: daily_run_test
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron:  '56 16 * * *'
+
+# A new run for the same workflow and ref cancels the run still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+# Values referenced by the job steps through ${{env.*}}.
+env:
+  CONDA_ENV: opencompass_regression
+  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
+  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
+  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+
+jobs:
+  # Creates a fresh conda env, installs the checkout, runs the evaluation
+  # into regression_result_daily and checks the scores with pytest. The
+  # conda env is removed again even if an earlier step failed.
+  daily_run_test:
+    runs-on: self-hosted
+    environment: 'prod'
+    timeout-minutes: 240 #4hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+      - name: Prepare - create conda env and install torch
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda create -y --name ${{env.CONDA_ENV}} python=3.10
+          conda activate ${{env.CONDA_ENV}}
+          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
+          conda info --envs
+      - name: Prepare - Pip install code
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
+          conda info --envs
+      - name: Prepare - prepare data and hf model
+        run: |
+          cp -r ${{env.USERSPACE_PREFIX}}/data .
+          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
+          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
+          # Each step runs in its own shell, so a plain `export` would be
+          # lost; $GITHUB_ENV makes the flags visible to the later steps.
+          echo "HF_DATASETS_OFFLINE=1" >> "$GITHUB_ENV"
+          echo "TRANSFORMERS_OFFLINE=1" >> "$GITHUB_ENV"
+      - name:  Run test
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          rm -rf regression_result_daily
+          export from_tf=TRUE
+          python3 run.py --models hf_internlm_chat_7b hf_internlm_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily
+      - name:  Get result
+        run: |
+          eval "$(conda shell.bash hook)"
+          # Install and run pytest inside the env that holds the project
+          # dependencies rather than in whatever env happens to be active.
+          conda activate ${{env.CONDA_ENV}}
+          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
+          python -m pytest -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name:  Remove Conda Env
+        if: always()
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda env remove --name ${{env.CONDA_ENV}}
+          conda info --envs
+
+  # Posts a failure message to the webhook in secrets.WEBHOOK_URL. It runs
+  # only when a needed job failed, the run was not cancelled, and the ref is
+  # `develop` or `main`.
+  notify_to_feishu:
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+    needs: [daily_run_test]
+    environment: 'prod'
+    timeout-minutes: 5
+    runs-on: self-hosted
+    steps:
+      - name: notify
+        run: |
+          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- Daily test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}
diff --git a/.github/workflows/link-check.yml b/.github/workflows/link-check.yml
@@ -0,0 +1,21 @@
+# Scheduled check of the links on the published documentation site.
+name: 'Link check'
+
+on:
+  schedule:
+    # check links at 01:30 a.m. every day
+    - cron: '30 1 * * *'
+
+jobs:
+  link-check:
+    runs-on: ubuntu-latest
+    steps:
+      # - uses: actions/checkout@v3
+
+      - name: linkchecker
+        run: |
+          pip install linkchecker
+          # Lines are joined with `\`; a trailing `|` would pipe the output
+          # into a separate `--ignore-url` command instead of passing the
+          # option. The patterns are quoted so the shell hands the regular
+          # expressions (backslashes, parentheses, braces) over unchanged.
+          linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30 --no-warnings \
+            --ignore-url 'https://opencompass\.readthedocs\.io/.*/static/images/opencompass_logo\.svg' \
+            --ignore-url 'https://opencompass\.readthedocs\.io/.*/_static/images/icon-menu-dots\.svg' \
+            --ignore-url 'https://opencompass\.readthedocs\.io/policy' \
+            --ignore-url 'https://opencompass\.readthedocs\.io/(en|zh_CN)/[0-9a-f]{40}/.*'
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
@@ -0,0 +1,80 @@
+# Regression workflow for pull requests; it also runs on a daily schedule
+# and can be started manually through workflow_dispatch.
+name: pr_run_test
+
+on:
+  pull_request:
+    # Pull requests that only touch these paths do not trigger the workflow.
+    paths-ignore:
+      - 'README.md'
+      - 'README_zh-CN.md'
+      - 'docs/**'
+      - 'configs/**'
+      - 'tools/**'
+
+  workflow_dispatch:
+  schedule:
+    - cron:  '56 22 * * *'
+
+# A new run for the same workflow and ref cancels the run still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+# Values referenced by the job steps through ${{env.*}}.
+env:
+  CONDA_ENV: opencompass_base
+  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
+  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+
+jobs:
+  # Installs the checkout into the existing conda env, evaluates
+  # hf_internlm_chat_7b on siqa_gen and fails unless the integer part of the
+  # score is between 70 and 75. The package is uninstalled again even if an
+  # earlier step failed.
+  pr_run_test:
+    runs-on: self-hosted
+    environment: 'prod'
+    timeout-minutes: 30
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+      - name: Prepare - Install opencompass
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          python3 -m pip uninstall opencompass -y
+          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
+          conda info --envs
+      - name: Prepare - prepare data and hf model
+        run: |
+          cp -r ${{env.USERSPACE_PREFIX}}/data .
+          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
+          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
+          # Each step runs in its own shell, so a plain `export` would be
+          # lost; $GITHUB_ENV makes the flags visible to the later steps.
+          echo "HF_DATASETS_OFFLINE=1" >> "$GITHUB_ENV"
+          echo "TRANSFORMERS_OFFLINE=1" >> "$GITHUB_ENV"
+      - name:  Run test
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          rm -rf regression_result
+          python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
+      - name:  Get result
+        run: |
+          # The score is the last field of the last line of the summary csv;
+          # only its integer part (${score%.*}) takes part in the comparison.
+          score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
+          if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
+             echo "score is $score between 70 and 75"
+          else
+             echo "score is $score not between 70 and 75"
+             exit 1
+          fi
+          rm -rf regression_result
+      - name:  Uninstall opencompass
+        if: always()
+        run: |
+          eval "$(conda shell.bash hook)"
+          conda activate ${{env.CONDA_ENV}}
+          python3 -m pip uninstall opencompass -y
+          conda info --envs
+
+  # Posts a failure message to the webhook in secrets.WEBHOOK_URL. It runs
+  # only when a needed job failed, the run was not cancelled, and the ref is
+  # `develop` or `main`.
+  notify_to_feishu:
+    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+    needs: [pr_run_test]
+    environment: 'prod'
+    timeout-minutes: 5
+    runs-on: self-hosted
+    steps:
+      - name: notify
+        run: |
+          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}'  ${{ secrets.WEBHOOK_URL }}