Merge remote-tracking branch 'upstream/main' into DecodingTrust

# Conflicts: # setup.cfg # src/helm/benchmark/run_specs.py # src/helm/benchmark/scenarios/opinions_qa_scenario.py # src/helm/benchmark/static/schema.yaml
AI-secure · Oct 19, 2023 · bcd4dc7 · bcd4dc7
2 parents b5f91f8 + 45e144d
commit bcd4dc7
Show file tree

Hide file tree

Showing 307 changed files with 27,970 additions and 1,750 deletions.
diff --git a/.github/workflows/frontend.yml b/.github/workflows/frontend.yml
@@ -0,0 +1,79 @@
+name: Frontend
+
+on:
+  push:
+    branches:
+      - '*'
+    paths:
+      - 'src/helm-frontend/**'
+  pull_request:
+    branches:
+      - '*'
+    paths:
+      - 'src/helm-frontend/**'
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+    - name: Use Node.js
+      uses: actions/setup-node@v3
+      with:
+        node-version: '18'
+    - name: Install dependencies
+      working-directory: ./src/helm-frontend
+      run: npm ci
+    - name: Run lint
+      working-directory: ./src/helm-frontend
+      run: npm run lint
+    - name: Run check format
+      working-directory: ./src/helm-frontend
+      run: npm run format:check
+    - name: Run tests
+      working-directory: ./src/helm-frontend
+      run: npm run test
+
+  build:
+    runs-on: ubuntu-latest
+    # Build (and hence deploy) only on pushes to the react_frontend branch; the commented-out condition below is the equivalent for main.
+    # if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+    if: github.event_name == 'push' && github.ref == 'refs/heads/react_frontend'
+    needs: test
+    environment:
+      name: github-pages
+    env:
+      VITE_HELM_BENCHMARKS_ENDPOINT: ${{ vars.VITE_HELM_BENCHMARKS_ENDPOINT }}
+      VITE_HELM_BENCHMARKS_SUITE: ${{ vars.VITE_HELM_BENCHMARKS_SUITE }}
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+    - name: Use Node.js
+      uses: actions/setup-node@v3
+      with:
+        node-version: '18'
+    - name: Install dependencies
+      working-directory: ./src/helm-frontend
+      run: npm ci
+    - name: Build app
+      working-directory: ./src/helm-frontend
+      run: npm run build
+    - name: Upload artifact
+      uses: actions/upload-pages-artifact@v2
+      with:
+        path: ./src/helm/benchmark/static_build/
+
+  deploy:
+    runs-on: ubuntu-latest
+    needs: build
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v2
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -1,7 +1,7 @@
 # This workflow will upload a Python Package using Twine when a release is created
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 
-name: Upload Python Package
+name: Publish Python package to PyPI
 
 on:
   release:
@@ -11,10 +11,14 @@ permissions:
   contents: read
 
 jobs:
-  deploy:
-
+  pypi-publish:
+    name: Publish Python package to PyPI
     runs-on: ubuntu-latest
-
+    environment:
+      name: pypi
+      url: https://pypi.org/p/crfm-helm
+    permissions:
+      id-token: write
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python
@@ -28,7 +32,4 @@ jobs:
     - name: Build package
       run: python -m build
     - name: Publish package
-      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
-      with:
-        user: __token__
-        password: ${{ secrets.PYPI_API_TOKEN }}
+      uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -2,10 +2,16 @@ name: Test
 on:
   push:
     branches: [ main ]
+    paths-ignore:
+      - 'src/helm-frontend/**'
   pull_request:
+    paths-ignore:
+      - 'src/helm-frontend/**'
 
 jobs:
   install:
+    # Tests that the HELM CLI commands work when only installing required dependencies
+    # without optional extra dependencies.
     name: Install
     runs-on: ubuntu-latest
     strategy:
@@ -20,7 +26,7 @@ jobs:
       - uses: actions/cache@v2
         with:
           path: ~/.cache/pip
-          key: pip-${{ hashFiles('requirements-freeze.txt') }}-${{ matrix.python-version }}
+          key: pip-${{ hashFiles('requirements.txt') }}-${{ matrix.python-version }}
           restore-keys: |
             pip-
       - run: pip install -e .
@@ -43,16 +49,15 @@ jobs:
       - uses: actions/cache@v2
         with:
           path: ~/.cache/pip
-          key: pip-${{ hashFiles('requirements-freeze.txt') }}
+          key: pip-${{ hashFiles('requirements.txt') }}
           restore-keys: |
             pip-
       # Installs dependencies and performs static code checks
       - run: python3 -m pip install virtualenv && python3 -m virtualenv -p python3 venv
       - run: source venv/bin/activate && ./install-dev.sh
       - run: source venv/bin/activate && ./pre-commit.sh
       - name: Run tests
-        # Skip ICE tokenizer tests. GHA is having trouble downloading ice_text.model.
-        run: source venv/bin/activate && pytest --ignore src/helm/benchmark/window_services/test_ice_window_service.py --ignore src/helm/proxy/clients/test_ice_tokenizer_client.py
+        run: source venv/bin/activate && pytest
         env:
           TEST: ${{ matrix.test }}
           VERSION: ${{ github.head_ref || 'main' }}

diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ benchmark_output
 www
 proxy_api_key*.txt
 microsoft_client.lock
+*.lock
 *.log
 *.out
 *.jsonl

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -4,4 +4,9 @@ repos:
     - id: run-pre-commit
       name: run-pre-commit
       entry: ./pre-commit.sh
-      language: script
+      language: script
+- repo: https://github.com/pre-commit/mirrors-prettier
+  rev: 'fc260393cc4ec09f8fc0a5ba4437f481c8b55dc1'
+  hooks:
+    - id: prettier
+      types_or: [tsx, javascript]
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,49 @@
 
 ## [Upcoming]
 
+## [v0.2.4] - 2023-09-20
+
+### Models
+
+- Added Meta LLaMA, Meta Llama 2, EleutherAI Pythia, Together RedPajama on Together (#1821)
+- Removed the unofficial chat-gpt client in favor of the official API (#1809)
+- Added support for models for the NeurIPS Efficiency Challenge (#1693)
+
+### Frontend
+
+- Added support for rendering train-test overlap stats in the frontend (#1747)
+- Fixed a bug where stats with NaN values would cause the frontend to fail to render tables (#1784)
+
+### Framework
+
+- Moved many dependencies, especially those only used by a single model provider or a small number of runs, to optional extra dependencies (#1798, #1844)
+- Widened some dependencies (e.g. PyTorch) to reduce dependency conflicts with other packages (#1759)
+- Added `MaxEvalInstancesRunExpander` to allow overriding the number of eval instances at the run level (#1837)
+- Updated human critique evaluation on Amazon Mechanical Turk to support emoji and other special characters (#1773)
+- Fixed a bug where in-context learning examples with multiple correct references were adapted to prompts where all the correct references are concatenated together as the output, which was not intended for some scenarios (e.g. narrative_qa, natural_qa, quac and wikifact) (#1785)
+- Fixed a bug where ObjectSpec is not hashable if any arg is a list (#1771)
+
+### Evaluations
+
+- Added evaluation results for Meta LLaMA, Meta Llama 2, EleutherAI Pythia, Together RedPajama on Together
+- Corrected evaluation results for AI21 Jurassic-2 and Writer Palmyra for the scenarios narrative_qa, natural_qa, quac and wikifact, as they were affected by the bug fixed by #1785
+
+### Contributors
+
+Thank you to the following contributors for your contributions to this HELM release!
+
+- @AndrewJGaut
+- @andyzorigin
+- @bidyapati-p
+- @drisspg
+- @mkly
+- @msaroufim
+- @percyliang
+- @teetone
+- @timothylimyl
+- @unnawut
+- @yifanmai
+
 ## [v0.2.3] - 2023-07-25
 
 ### Models
@@ -134,7 +177,8 @@
 
 - Initial release
 
-[upcoming]: https://github.com/stanford-crfm/helm/compare/v0.2.3...HEAD
+[upcoming]: https://github.com/stanford-crfm/helm/compare/v0.2.4...HEAD
+[v0.2.4]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.4
 [v0.2.3]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.3
 [v0.2.2]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.2
 [v0.2.1]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.1

diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -33,9 +33,9 @@ The meaning of the additional arguments are as follows:
 `helm-run` creates an environment directory environment and an output directory by default.
 
 -  The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory.
--  The output directory is `benchmarking_output/` by default and can be set using `--output-path`.
+-  The output directory is `benchmark_output/` by default and can be set using `--output-path`.
 
-After running this command, navigate to the `benchmarking_output/runs/v1/` directory. This should contain a two sub-directories named `mmlu:subject=anatomy,model=huggingface_gpt-2` and `mmlu:subject=philosophy,model=huggingface_gpt-2`. Note that the names of these sub-directories is based on the run spec descriptions we used earlier, but with `/` replaced with `_`.
+After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=huggingface_gpt-2` and `mmlu:subject=philosophy,model=huggingface_gpt-2`. Note that the names of these sub-directories are based on the run spec descriptions we used earlier, but with `/` replaced with `_`.
 
 Each output sub-directory will contain several JSON files that were generated during the corresponding run:
 

diff --git a/install-dev.sh b/install-dev.sh
@@ -9,10 +9,10 @@ if [[ $OSTYPE != 'darwin'* ]]; then
   # Manually install pytorch to avoid pip getting killed: https://stackoverflow.com/a/54329850
   pip install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html torch==1.12.1+cu113 torchvision==0.13.1+cu113
 fi
-# Manually install protobuf to workaround issue: https://github.com/protocolbuffers/protobuf/issues/6550
-pip install --no-binary=protobuf protobuf==3.20.2
 # Install all pinned dependencies
-pip install -r requirements-freeze.txt
+pip install -r requirements.txt
+# Upgrade pip so that HELM can be installed in edit mode without a setup.py
+pip install --upgrade pip
 # Install HELM in edit mode
 pip install -e .[all]
 # Check dependencies

diff --git a/requirements-dev.txt b/requirements-dev.txt
diff --git a/requirements-freeze.txt → requirements.txt b/requirements-freeze.txt → requirements.txt
@@ -1,7 +1,7 @@
 2captcha-python==1.1.3
 absl-py==1.2.0
 aiodns==3.0.0
-aiohttp==3.8.3
+aiohttp==3.8.5
 aiohttp-retry==2.8.3
 aiosignal==1.2.0
 aleph-alpha-client==2.14.0
@@ -10,7 +10,7 @@ async-generator==1.10
 async-timeout==4.0.2
 attrs==22.1.0
 beautifulsoup4==4.11.1
-bert-score==0.3.11
+bert-score==0.3.13
 bitarray==2.7.3
 black==22.10.0
 blanc==0.2.7
@@ -21,7 +21,7 @@ bottle==0.12.23
 cachetools==5.2.0
 catalogue==2.0.8
 cattrs==22.2.0
-certifi==2022.12.7
+certifi==2023.7.22
 cffi==1.15.1
 cfgv==3.3.1
 charset-normalizer==2.1.1
@@ -55,7 +55,7 @@ greenlet==1.1.3
 gunicorn==20.1.0
 h11==0.14.0
 httplib2==0.20.4
-huggingface-hub==0.11.0
+huggingface-hub==0.15.1
 icetk==0.0.4
 identify==2.5.6
 idna==3.4
@@ -75,12 +75,12 @@ MarkupSafe==2.1.1
 matplotlib==3.6.0
 mccabe==0.7.0
 moverscore==1.0.3
-mpmath==1.2.1
+mpmath==1.3.0
 multidict==6.0.2
 multiprocess==0.70.13
 murmurhash==1.0.8
-mypy==0.982
-mypy-extensions==0.4.3
+mypy==1.5.1
+mypy-extensions==1.0.0
 networkx==2.8.7
 nltk==3.7
 nodeenv==1.7.0
@@ -94,7 +94,7 @@ pandas==1.5.0
 pandas-stubs==1.5.0.221003
 parameterized==0.8.1
 pathspec==0.10.1
-pathy==0.6.2
+pathy==0.10.2
 Pillow==9.3.0
 platformdirs==2.5.2
 pluggy==1.0.0
@@ -124,7 +124,7 @@ pytrec-eval==0.5
 pytz==2022.4
 PyYAML==6.0
 regex==2022.9.13
-requests==2.28.1
+requests==2.31.0
 responses==0.18.0
 retrying==1.3.4
 rouge-score==0.1.2
@@ -134,18 +134,17 @@ sacrebleu==2.2.1
 sacremoses==0.0.53
 scaleapi==2.13.0
 scikit-learn==1.1.2
-scipy==1.9.1
+scipy==1.10.0
 selenium==4.8.0
 sentencepiece==0.1.97
 simple-slurm==0.2.6
 six==1.16.0
-sklearn==0.0
 smart-open==5.2.1
 sniffio==1.3.0
 sortedcontainers==2.4.0
 soupsieve==2.3.2.post1
-spacy==3.2.4
-spacy-legacy==3.0.10
+spacy==3.5.4
+spacy-legacy==3.0.12
 spacy-loggers==1.0.3
 sqlitedict==1.7.0
 srsly==2.4.4
@@ -154,7 +153,7 @@ summ-eval==0.892
 surge-api==1.1.0
 sympy==1.11.1
 tabulate==0.9.0
-thinc==8.0.17
+thinc==8.1.12
 threadpoolctl==3.1.0
 tiktoken==0.3.3
 tls-client==0.1.8
@@ -166,18 +165,19 @@ torchvision==0.13.1 ; sys_platform == "darwin"
 torch==1.12.1+cu113 ; sys_platform == "linux"
 torchvision==0.13.1+cu113 ; sys_platform == "linux"
 tqdm==4.64.1
-transformers==4.28.1
+transformers==4.33.1
 trio==0.22.0
 trio-websocket==0.9.2
 typer==0.4.2
+types-Pillow==9.3.0.4
 types-pytz==2022.4.0.0
 types-redis==4.3.21.1
 types-requests==2.28.11.2
 types-tabulate==0.9.0.0
 types-urllib3==1.26.25
 typing==3.7.4.3
 typing_extensions==4.4.0
-uncertainty-calibration==0.1.3
+uncertainty-calibration==0.1.4
 undetected-chromedriver==3.2.1
 uritemplate==4.1.1
 urllib3==1.26.12