Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into DecodingTrust
Browse files Browse the repository at this point in the history
# Conflicts:
#	setup.cfg
#	src/helm/benchmark/run_specs.py
#	src/helm/benchmark/scenarios/opinions_qa_scenario.py
#	src/helm/benchmark/static/schema.yaml
  • Loading branch information
danielz02 committed Oct 19, 2023
2 parents b5f91f8 + 45e144d commit bcd4dc7
Show file tree
Hide file tree
Showing 307 changed files with 27,970 additions and 1,750 deletions.
79 changes: 79 additions & 0 deletions .github/workflows/frontend.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# CI/CD for the React frontend that lives in src/helm-frontend.
#
# Jobs:
#   test   - lint, format check and unit tests
#   build  - production build, uploaded as a GitHub Pages artifact
#   deploy - publishes the uploaded artifact to GitHub Pages
name: Frontend

on:
  push:
    # '**' matches every branch name. A bare '*' does not match names that
    # contain a slash (e.g. feature/foo), which would silently skip CI there.
    branches:
      - '**'
    # Only run when frontend sources change.
    paths:
      - 'src/helm-frontend/**'
  pull_request:
    # For pull_request events this filter applies to the base branch.
    branches:
      - '**'
    paths:
      - 'src/helm-frontend/**'

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Use Node.js
        uses: actions/setup-node@v3
        with:
          # Quoted so the version is always read as a string.
          node-version: '18'
      - name: Install dependencies
        working-directory: ./src/helm-frontend
        run: npm ci
      - name: Run lint
        working-directory: ./src/helm-frontend
        run: npm run lint
      - name: Run check format
        working-directory: ./src/helm-frontend
        run: npm run format:check
      - name: Run tests
        working-directory: ./src/helm-frontend
        run: npm run test

  build:
    runs-on: ubuntu-latest
    # Build only on pushes to the react_frontend branch. Because `deploy`
    # needs this job, skipping it here also skips the deployment.
    # Equivalent gate for the main branch, currently disabled:
    # if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    if: github.event_name == 'push' && github.ref == 'refs/heads/react_frontend'
    needs: test
    environment:
      name: github-pages
    env:
      # Configuration variables exposed to the `npm run build` step.
      # NOTE(review): presumably defined on the github-pages environment or
      # at repository level - confirm where these vars are configured.
      VITE_HELM_BENCHMARKS_ENDPOINT: ${{ vars.VITE_HELM_BENCHMARKS_ENDPOINT }}
      VITE_HELM_BENCHMARKS_SUITE: ${{ vars.VITE_HELM_BENCHMARKS_SUITE }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Use Node.js
        uses: actions/setup-node@v3
        with:
          node-version: '18'
      - name: Install dependencies
        working-directory: ./src/helm-frontend
        run: npm ci
      - name: Build app
        working-directory: ./src/helm-frontend
        run: npm run build
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v2
        with:
          # NOTE(review): assumes the frontend build writes its output here
          # rather than into src/helm-frontend - confirm against the build config.
          path: ./src/helm/benchmark/static_build/

  deploy:
    runs-on: ubuntu-latest
    needs: build
    # Permissions requested for the Pages deployment step.
    permissions:
      pages: write
      id-token: write
    environment:
      name: github-pages
      # Shows the published site URL on the deployment in the GitHub UI.
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v2
17 changes: 9 additions & 8 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

name: Upload Python Package
name: Publish Python package to PyPI

on:
release:
Expand All @@ -11,10 +11,14 @@ permissions:
contents: read

jobs:
deploy:

pypi-publish:
name: Publish Python package to PyPI
runs-on: ubuntu-latest

environment:
name: pypi
url: https://pypi.org/p/crfm-helm
permissions:
id-token: write
steps:
- uses: actions/checkout@v3
- name: Set up Python
Expand All @@ -28,7 +32,4 @@ jobs:
- name: Build package
run: python -m build
- name: Publish package
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}
uses: pypa/gh-action-pypi-publish@release/v1
13 changes: 9 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,16 @@ name: Test
on:
push:
branches: [ main ]
paths-ignore:
- 'src/helm-frontend/**'
pull_request:
paths-ignore:
- 'src/helm-frontend/**'

jobs:
install:
# Tests that the HELM CLI commands work when only installing required dependencies
# without optional extra dependencies.
name: Install
runs-on: ubuntu-latest
strategy:
Expand All @@ -20,7 +26,7 @@ jobs:
- uses: actions/cache@v2
with:
path: ~/.cache/pip
key: pip-${{ hashFiles('requirements-freeze.txt') }}-${{ matrix.python-version }}
key: pip-${{ hashFiles('requirements.txt') }}-${{ matrix.python-version }}
restore-keys: |
pip-
- run: pip install -e .
Expand All @@ -43,16 +49,15 @@ jobs:
- uses: actions/cache@v2
with:
path: ~/.cache/pip
key: pip-${{ hashFiles('requirements-freeze.txt') }}
key: pip-${{ hashFiles('requirements.txt') }}
restore-keys: |
pip-
# Installs dependencies and performs static code checks
- run: python3 -m pip install virtualenv && python3 -m virtualenv -p python3 venv
- run: source venv/bin/activate && ./install-dev.sh
- run: source venv/bin/activate && ./pre-commit.sh
- name: Run tests
# Skip ICE tokenizer tests. GHA is having trouble downloading ice_text.model.
run: source venv/bin/activate && pytest --ignore src/helm/benchmark/window_services/test_ice_window_service.py --ignore src/helm/proxy/clients/test_ice_tokenizer_client.py
run: source venv/bin/activate && pytest
env:
TEST: ${{ matrix.test }}
VERSION: ${{ github.head_ref || 'main' }}
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ benchmark_output
www
proxy_api_key*.txt
microsoft_client.lock
*.lock
*.log
*.out
*.jsonl
Expand Down
7 changes: 6 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,9 @@ repos:
- id: run-pre-commit
name: run-pre-commit
entry: ./pre-commit.sh
language: script
language: script
- repo: https://github.com/pre-commit/mirrors-prettier
rev: 'fc260393cc4ec09f8fc0a5ba4437f481c8b55dc1'
hooks:
- id: prettier
types_or: [tsx, javascript]
46 changes: 45 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,49 @@

## [Upcoming]

## [v0.2.4] - 2023-09-20

### Models

- Added Meta LLaMA, Meta Llama 2, EleutherAI Pythia, Together RedPajama on Together (#1821)
- Removed the unofficial chat-gpt client in favor of the official API (#1809)
- Added support for models for the NeurIPS Efficiency Challenge (#1693)

### Frontend

- Added support for rendering train-test overlap stats in the frontend (#1747)
- Fixed a bug where stats with NaN values would cause the frontend to fail to render tables (#1784)

### Framework

- Moved many dependencies, especially those only used by a single model provider or a small number of runs, to optional extra dependencies (#1798, #1844)
- Widened some dependencies (e.g. PyTorch) to reduce dependency conflicts with other packages (#1759)
- Added `MaxEvalInstancesRunExpander` to allow overriding the number of eval instances at the run level (#1837)
- Updated human critique evaluation on Amazon Mechanical Turk to support emoji and other special characters (#1773)
- Fixed a bug where in-context learning examples with multiple correct references were adapted to prompts where all the correct references are concatenated together as the output, which was not intended for some scenarios (e.g. narrative_qa, natural_qa, quac and wikifact) (#1785)
- Fixed a bug where ObjectSpec is not hashable if any arg is a list (#1771)

### Evaluations

- Added evaluation results for Meta LLaMA, Meta Llama 2, EleutherAI Pythia, Together RedPajama on Together
- Corrected evaluation results for AI21 Jurassic-2 and Writer Palmyra for the scenarios narrative_qa, natural_qa, quac and wikifact, as they were affected by the bug fixed by #1785

### Contributors

Thank you to the following contributors for your contributions to this HELM release!

- @AndrewJGaut
- @andyzorigin
- @bidyapati-p
- @drisspg
- @mkly
- @msaroufim
- @percyliang
- @teetone
- @timothylimyl
- @unnawut
- @yifanmai

## [v0.2.3] - 2023-07-25

### Models
Expand Down Expand Up @@ -134,7 +177,8 @@

- Initial release

[upcoming]: https://github.com/stanford-crfm/helm/compare/v0.2.3...HEAD
[upcoming]: https://github.com/stanford-crfm/helm/compare/v0.2.4...HEAD
[v0.2.4]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.4
[v0.2.3]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.3
[v0.2.2]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.2
[v0.2.1]: https://github.com/stanford-crfm/helm/releases/tag/v0.2.1
Expand Down
4 changes: 2 additions & 2 deletions docs/tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ The meaning of the additional arguments are as follows:
`helm-run` creates an environment directory and an output directory by default.

- The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory.
- The output directory is `benchmarking_output/` by default and can be set using `--output-path`.
- The output directory is `benchmark_output/` by default and can be set using `--output-path`.

After running this command, navigate to the `benchmarking_output/runs/v1/` directory. This should contain a two sub-directories named `mmlu:subject=anatomy,model=huggingface_gpt-2` and `mmlu:subject=philosophy,model=huggingface_gpt-2`. Note that the names of these sub-directories is based on the run spec descriptions we used earlier, but with `/` replaced with `_`.
After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=huggingface_gpt-2` and `mmlu:subject=philosophy,model=huggingface_gpt-2`. Note that the names of these sub-directories are based on the run spec descriptions we used earlier, but with `/` replaced with `_`.

Each output sub-directory will contain several JSON files that were generated during the corresponding run:

Expand Down
6 changes: 3 additions & 3 deletions install-dev.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ if [[ $OSTYPE != 'darwin'* ]]; then
# Manually install pytorch to avoid pip getting killed: https://stackoverflow.com/a/54329850
pip install --no-cache-dir --find-links https://download.pytorch.org/whl/torch_stable.html torch==1.12.1+cu113 torchvision==0.13.1+cu113
fi
# Manually install protobuf to workaround issue: https://github.com/protocolbuffers/protobuf/issues/6550
pip install --no-binary=protobuf protobuf==3.20.2
# Install all pinned dependencies
pip install -r requirements-freeze.txt
pip install -r requirements.txt
# upgrade pip to install in edit mode without setup.py
pip install --upgrade pip
# Install HELM in edit mode
pip install -e .[all]
# Check dependencies
Expand Down
6 changes: 0 additions & 6 deletions requirements-dev.txt

This file was deleted.

32 changes: 16 additions & 16 deletions requirements-freeze.txt → requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
2captcha-python==1.1.3
absl-py==1.2.0
aiodns==3.0.0
aiohttp==3.8.3
aiohttp==3.8.5
aiohttp-retry==2.8.3
aiosignal==1.2.0
aleph-alpha-client==2.14.0
Expand All @@ -10,7 +10,7 @@ async-generator==1.10
async-timeout==4.0.2
attrs==22.1.0
beautifulsoup4==4.11.1
bert-score==0.3.11
bert-score==0.3.13
bitarray==2.7.3
black==22.10.0
blanc==0.2.7
Expand All @@ -21,7 +21,7 @@ bottle==0.12.23
cachetools==5.2.0
catalogue==2.0.8
cattrs==22.2.0
certifi==2022.12.7
certifi==2023.7.22
cffi==1.15.1
cfgv==3.3.1
charset-normalizer==2.1.1
Expand Down Expand Up @@ -55,7 +55,7 @@ greenlet==1.1.3
gunicorn==20.1.0
h11==0.14.0
httplib2==0.20.4
huggingface-hub==0.11.0
huggingface-hub==0.15.1
icetk==0.0.4
identify==2.5.6
idna==3.4
Expand All @@ -75,12 +75,12 @@ MarkupSafe==2.1.1
matplotlib==3.6.0
mccabe==0.7.0
moverscore==1.0.3
mpmath==1.2.1
mpmath==1.3.0
multidict==6.0.2
multiprocess==0.70.13
murmurhash==1.0.8
mypy==0.982
mypy-extensions==0.4.3
mypy==1.5.1
mypy-extensions==1.0.0
networkx==2.8.7
nltk==3.7
nodeenv==1.7.0
Expand All @@ -94,7 +94,7 @@ pandas==1.5.0
pandas-stubs==1.5.0.221003
parameterized==0.8.1
pathspec==0.10.1
pathy==0.6.2
pathy==0.10.2
Pillow==9.3.0
platformdirs==2.5.2
pluggy==1.0.0
Expand Down Expand Up @@ -124,7 +124,7 @@ pytrec-eval==0.5
pytz==2022.4
PyYAML==6.0
regex==2022.9.13
requests==2.28.1
requests==2.31.0
responses==0.18.0
retrying==1.3.4
rouge-score==0.1.2
Expand All @@ -134,18 +134,17 @@ sacrebleu==2.2.1
sacremoses==0.0.53
scaleapi==2.13.0
scikit-learn==1.1.2
scipy==1.9.1
scipy==1.10.0
selenium==4.8.0
sentencepiece==0.1.97
simple-slurm==0.2.6
six==1.16.0
sklearn==0.0
smart-open==5.2.1
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.3.2.post1
spacy==3.2.4
spacy-legacy==3.0.10
spacy==3.5.4
spacy-legacy==3.0.12
spacy-loggers==1.0.3
sqlitedict==1.7.0
srsly==2.4.4
Expand All @@ -154,7 +153,7 @@ summ-eval==0.892
surge-api==1.1.0
sympy==1.11.1
tabulate==0.9.0
thinc==8.0.17
thinc==8.1.12
threadpoolctl==3.1.0
tiktoken==0.3.3
tls-client==0.1.8
Expand All @@ -166,18 +165,19 @@ torchvision==0.13.1 ; sys_platform == "darwin"
torch==1.12.1+cu113 ; sys_platform == "linux"
torchvision==0.13.1+cu113 ; sys_platform == "linux"
tqdm==4.64.1
transformers==4.28.1
transformers==4.33.1
trio==0.22.0
trio-websocket==0.9.2
typer==0.4.2
types-Pillow==9.3.0.4
types-pytz==2022.4.0.0
types-redis==4.3.21.1
types-requests==2.28.11.2
types-tabulate==0.9.0.0
types-urllib3==1.26.25
typing==3.7.4.3
typing_extensions==4.4.0
uncertainty-calibration==0.1.3
uncertainty-calibration==0.1.4
undetected-chromedriver==3.2.1
uritemplate==4.1.1
urllib3==1.26.12
Expand Down
Loading

0 comments on commit bcd4dc7

Please sign in to comment.