Merge pull request #96 from stanford-crfm/dev
Release v1.0
J38 authored Sep 9, 2021
2 parents c4683c4 + 2d54234 commit 7be4c58
Showing 35 changed files with 1,117 additions and 58 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/run-tests.yaml
@@ -0,0 +1,39 @@

name: Run Tests
on: [push]
jobs:
  Run-Mistral-Tests:
    runs-on: self-hosted
    steps:
      - run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
      - run: echo "🐧 This job is now running on a ${{ runner.os }} self-hosted runner!"
      - run: echo "🔎 The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}."
      - name: Check out repository code
        uses: actions/checkout@v2
      - run: echo "💡 The ${{ github.repository }} repository has been cloned to the runner."
      - run: echo "🖥️ The workflow is now ready to test your code on the runner."
      - name: Setup
        run: |
          cp -r /home/stanzabuild/mistral/wandb .
          wandb offline
      - name: Tests for arguments (single node/single GPU)
        if: always()
        run: |
          cd tests
          CUDA_VISIBLE_DEVICES=0 pytest test_args.py
      - name: Tests for checkpoints (single node/single GPU)
        if: always()
        run: |
          cd tests
          CUDA_VISIBLE_DEVICES=0 pytest test_checkpoint.py
      - name: Tests for upcasting (single node/single GPU)
        if: always()
        run: |
          cd tests
          CUDA_VISIBLE_DEVICES=0 pytest test_fp.py
      - name: Tests for random seed (single node/single GPU)
        if: always()
        run: |
          cd tests
          CUDA_VISIBLE_DEVICES=0 pytest test_seed.py
      - run: echo "🍏 This job's status is ${{ job.status }}."
13 changes: 9 additions & 4 deletions .pre-commit-config.yaml
@@ -7,7 +7,7 @@ fail_fast: true
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.2.0
+    rev: v4.0.1
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -17,17 +17,22 @@ repos:
       - id: check-added-large-files
 
   - repo: https://github.com/psf/black
-    rev: 20.8b1
+    rev: 21.8b0
     hooks:
       - id: black
 
   - repo: https://github.com/timothycrosley/isort
-    rev: 5.6.4
+    rev: 5.9.3
     hooks:
       - id: isort
 
   - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.8.4
+    rev: 3.9.2
     hooks:
       - id: flake8
         additional_dependencies: [flake8-isort]
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: 'v0.910'
+    hooks:
+      - id: mypy
15 changes: 11 additions & 4 deletions README.md
@@ -8,7 +8,7 @@
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-green?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
 
 A framework for transparent and accessible large-scale language model training, built with [Hugging Face 🤗](https://huggingface.co/). Includes tools
-and helpful scripts for incorporating new pre-training datasets, various schemes for single node and distributed training - including on
+and helpful scripts for incorporating new pre-training datasets, various schemes for single node and distributed training - including on
 cloud providers like GCP, and importantly, scripts for evaluation.
 
 Visit our [Read the Docs](https://nlp.stanford.edu/mistral) for the full documentation.
@@ -143,8 +143,8 @@ We have also stored over 600 checkpoints for each model, subject to the following:
 - Every 100 Steps, from 2000 - 20,000 Steps.
 - Every 1000 Steps, from 20,000 - 400,000 Steps.
 
-This comes out to _610 checkpoints per run, taking up ~22TB for all 10 models_ (making it pretty expensive to host!). If you are interested in acquiring
-these additional checkpoints, please [file an issue](https://github.com/stanford-crfm/mistral/issues) or contact Laurel (lorr1) and Sidd (skaramcheti)
+This comes out to _610 checkpoints per run, taking up ~22TB for all 10 models_ (making it pretty expensive to host!). If you are interested in acquiring
+these additional checkpoints, please [file an issue](https://github.com/stanford-crfm/mistral/issues) or contact Laurel (lorr1) and Sidd (skaramcheti)
 at their @cs.stanford.edu email addresses, and we'll be happy to figure out a cost-effective solution to sharing them.
 
 GPT-2 Medium
@@ -201,11 +201,18 @@ GPT-2 Small
 
 ## Issues
 
-To ask questions, report issues, or request features, please use the [GitHub Issue Tracker](https://github.com/stanford-crfm/mistral/issues).
+To ask questions, report issues, or request features, please use the [GitHub Issue Tracker](https://github.com/stanford-crfm/mistral/issues).
 Before creating a new issue, please make sure to search for existing issues that may solve your problem.
 
 ---
 
+## Differences between Mistral and Hugging Face
+
+Please visit the [following page](https://nlp.stanford.edu/mistral/hugging_face_differences.html), which outlines the
+differences between the two codebases.
+
+---
+
 ## Contributing
 
 Please see the [following page](https://nlp.stanford.edu/mistral/contributing.html) for information on contributing.
2 changes: 1 addition & 1 deletion conf/train_schema.py
@@ -21,7 +21,7 @@
 
 
 def get_schema() -> Dict[str, Any]:
-    """ Get the Cerberus schema for the Quinine config used in train.py. """
+    """Get the Cerberus schema for the Quinine config used in train.py."""
 
     # Schema for Dataset
     data_schema = {
1 change: 1 addition & 0 deletions docs/README.md
@@ -4,6 +4,7 @@ If you don't already have Sphinx set up, install it with `pip`.
 
 ```bash
 pip install sphinx
+pip install sphinx-rtd-theme
 ```
 
 The documentation has been built with version 4.0.2.
47 changes: 25 additions & 22 deletions docs/conf.py
@@ -12,27 +12,29 @@
 # Problems with imports? Could try `export PYTHONPATH=$PYTHONPATH:`pwd`` from root project dir...
 import os
 import sys
-sys.path.insert(0, os.path.abspath('..')) # Source code dir relative to this file
+
+
+sys.path.insert(0, os.path.abspath(".."))  # Source code dir relative to this file
 
 # -- Project information -----------------------------------------------------
 
-project = 'Mistral'
-author = 'Project Mercury'
-copyright = '2021 The Board of Trustees of The Leland Stanford Junior University'
+project = "Mistral"
+author = "Project Mercury"
+copyright = "2021 The Board of Trustees of The Leland Stanford Junior University"
 
 # The full version, including alpha/beta/rc tags
-release = '0.1.0'
+release = "0.1.0"
 
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc', # Core Sphinx library for auto html doc generation from docstrings
-    'sphinx.ext.autosummary', # Create neat summary tables for modules/classes/methods etc
-    'sphinx.ext.intersphinx', # Link to other project's documentation (see mapping below)
-    'sphinx.ext.viewcode' # Add a link to the Python source code for classes, functions etc.
+    "sphinx.ext.autodoc",  # Core Sphinx library for auto html doc generation from docstrings
+    "sphinx.ext.autosummary",  # Create neat summary tables for modules/classes/methods etc
+    "sphinx.ext.intersphinx",  # Link to other project's documentation (see mapping below)
+    "sphinx.ext.viewcode",  # Add a link to the Python source code for classes, functions etc.
 ]
 
 # Mappings for sphinx.ext.intersphinx. Projects have to have Sphinx-generated doc! (.inv file)
@@ -46,20 +48,20 @@
 autodoc_inherit_docstrings = True  # If no docstring, inherit from base class
 set_type_checking_flag = True  # Enable 'expensive' imports for sphinx_autodoc_typehints
 nbsphinx_allow_errors = True  # Continue through Jupyter errors
-#autodoc_typehints = "description" # Sphinx-native method. Not as good as sphinx_autodoc_typehints
-add_module_names = False # Remove namespaces from class/method signatures
+# autodoc_typehints = "description" # Sphinx-native method. Not as good as sphinx_autodoc_typehints
+add_module_names = False  # Remove namespaces from class/method signatures
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # Exclusions
 # To exclude a module, use autodoc_mock_imports. Note this may increase build time, a lot.
 # (Also, when installing on readthedocs.org, we omit installing Tensorflow and
 # Tensorflow Probability so mock them here instead.)
-#autodoc_mock_imports = [
-# 'tensorflow',
-# 'tensorflow_probability',
-#]
+# autodoc_mock_imports = [
+# 'tensorflow',
+# 'tensorflow_probability',
+# ]
 # To exclude a class, function, method or attribute, use autodoc-skip-member. (Note this can also
 # be used in reverse, ie. to re-include a particular member that has been excluded.)
 # 'Private' and 'special' members (_ and __) are excluded using the Jinja2 templates; from the main
@@ -88,17 +90,18 @@
 on_rtd = os.environ.get("READTHEDOCS", None) == "True"
 if not on_rtd:  # only import and set the theme if we're building docs locally
     import sphinx_rtd_theme
+
     html_theme = "sphinx_rtd_theme"
     html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-html_css_files = ["readthedocs-custom.css"] # Override some CSS settings
+html_css_files = ["readthedocs-custom.css"]  # Override some CSS settings
 
 # Pydata theme
-#html_theme = "pydata_sphinx_theme"
-#html_logo = "_static/logo-company.png"
-#html_theme_options = { "show_prev_next": False}
-#html_css_files = ['pydata-custom.css']
+# html_theme = "pydata_sphinx_theme"
+# html_logo = "_static/logo-company.png"
+# html_theme_options = { "show_prev_next": False}
+# html_css_files = ['pydata-custom.css']
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
36 changes: 36 additions & 0 deletions docs/hugging_face_differences.rst
@@ -0,0 +1,36 @@
Differences between Mistral and Hugging Face
============================================

Mistral is not a replacement for Hugging Face. Rather, we extend the current functionality of Hugging Face
by fixing stability issues with GPT training, adding evaluation scripts, and supporting distributed training
with the DeepSpeed optimization library.


**Stability**

When training GPT-2 Small models with Hugging Face, some of the models crashed due to numerical instability.
We fixed this issue by rearranging the order of operations in the scaled dot-product attention computation
and upcasting to FP32. We also scaled down the weights by dividing by the layer number to prevent overflow.
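
A minimal sketch of the upcasting idea (illustrative only, not the exact Mistral implementation): dividing
the query by both the softmax scale and the layer number keeps the half-precision matmul from overflowing,
and multiplying the layer number back after upcasting to FP32 leaves the attention probabilities
mathematically unchanged.

.. code-block:: python

    import math

    import torch

    def stable_attention_probs(q: torch.Tensor, k: torch.Tensor, layer_number: int) -> torch.Tensor:
        # Fold 1/sqrt(d) and 1/layer_number into the half-precision matmul
        # so the intermediate scores stay small enough to avoid overflow.
        d = q.size(-1)
        scores = torch.matmul(q / (math.sqrt(d) * layer_number), k.transpose(-2, -1))
        # Upcast to FP32 and restore the layer scaling; this multiplication
        # is exact in FP32, so the attention probabilities are unchanged.
        scores = scores.float() * layer_number
        return torch.softmax(scores, dim=-1).type_as(q)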


**Evaluation**

We added online evaluation so that perplexity (PPL) can be measured on arbitrary datasets during training.
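
Since PPL is the exponentiated mean per-token cross-entropy, converting an evaluation loss to perplexity
is a one-liner; a minimal sketch (the function name is illustrative):

.. code-block:: python

    import math

    def perplexity(mean_cross_entropy: float) -> float:
        # Perplexity is exp of the mean token-level cross-entropy (in nats).
        return math.exp(mean_cross_entropy)

    print(perplexity(3.0))  # an eval loss of 3.0 nats/token is a PPL of ~20.1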


**Parallelism**

We noticed that integrating parallelism (e.g., tensor model-parallelism and pipelining) breaks the current
Hugging Face APIs.


**Distributed Training**

We provide ready-to-use scripts and configuration files for running distributed training with DeepSpeed on
Google Cloud Platform and Kubernetes.
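
As an illustration only (a hypothetical minimal setup, not one of the configuration files shipped with
Mistral), Hugging Face's ``TrainingArguments`` accepts a DeepSpeed configuration directly, either as a
path to a JSON file or as an already-loaded dict:

.. code-block:: python

    from transformers import TrainingArguments

    # Hypothetical minimal DeepSpeed config; real runs would tune these values.
    ds_config = {
        "train_micro_batch_size_per_gpu": 8,
        "gradient_accumulation_steps": 4,
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": 1},
    }

    args = TrainingArguments(output_dir="runs/example", deepspeed=ds_config)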


**Future**

We are working closely with the team at Hugging Face and plan to integrate Mistral into the Hugging Face
library in the future.
1 change: 1 addition & 0 deletions docs/index.rst
@@ -27,6 +27,7 @@
 
     Contributing <contributing>
     API reference <_autosummary/src>
+    Differences between Mistral and Hugging Face <hugging_face_differences>
 
 Mistral - Large Scale Language Modeling Made Easy
 =====================================================
1 change: 1 addition & 0 deletions environments/environment-cpu.yaml
@@ -174,6 +174,7 @@ dependencies:
 - pycodestyle==2.6.0
 - pyflakes==2.2.0
 - pylatex==1.4.1
+- pytest==6.2.5
 - pytz==2021.1
 - pyyaml==5.4
 - quinine==0.3.0
1 change: 1 addition & 0 deletions environments/environment-gpu.yaml
@@ -183,6 +183,7 @@ dependencies:
 - pycodestyle==2.6.0
 - pyflakes==2.2.0
 - pylatex==1.4.1
+- pytest==6.2.5
 - pytz==2021.1
 - pyyaml==5.4
 - quinine==0.3.0
31 changes: 31 additions & 0 deletions mypy.ini
@@ -0,0 +1,31 @@
[mypy]
disable_error_code=override

# do not follow imports (except for ones found in typeshed)
ignore_missing_imports = True
# ignore errors for third-party code
ignore_errors = True
follow_imports = silent

# treat Optional per PEP 484
strict_optional = False

warn_unused_configs = True
warn_redundant_casts = True
# ensure all execution paths are returning
warn_no_return = True
warn_unreachable = True
allow_redefinition = True

show_error_codes = True
check_untyped_defs = True


files=
src,
tests,
train.py
python_version = 3.6

[mypy-src.*]
ignore_errors = False
2 changes: 1 addition & 1 deletion src/args/training_args.py
@@ -26,7 +26,7 @@ def get_training_arguments(
     nodes: int = 1,
     gpus_per_node: int = 8,
 ) -> TrainingArguments:
-    """ Initialize Training Arguments from Quinfig and Runtime-Defined Variables. """
+    """Initialize Training Arguments from Quinfig and Runtime-Defined Variables."""
 
     # `quinfig_args` already contains some default training arguments --> we'll be overwriting/adding to the Dict
     #   =>> a `Munch` is a subclass of Dictionary that supports attribute style access
15 changes: 8 additions & 7 deletions src/core/callbacks.py
@@ -8,7 +8,7 @@
 import os
 import time
 from bisect import bisect_left
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import jsonlines
 import torch
@@ -41,7 +41,7 @@ def rewrite_logs(d: Dict[str, float]) -> Dict[str, float]:
 
 
 class CustomWandbCallback(WandbCallback):
-    """ Custom Weights and Biases Callback used by Mistral for logging information from the Huggingface Trainer. """
+    """Custom Weights and Biases Callback used by Mistral for logging information from the Huggingface Trainer."""
 
     def __init__(
         self,
@@ -70,14 +70,15 @@ def __init__(
         self.group, self.resume, self.resume_run_id, self.wandb_dir = group, resume, resume_run_id, wandb_dir
 
         # Timers
-        self.within_time, self.between_time = None, None
+        self.within_time: Optional[float] = None
+        self.between_time: Optional[float] = None
 
     def _append_jsonl(self, data) -> None:
         with jsonlines.open(self.json_file, mode="a") as writer:
             writer.write(data)
 
     def _log_memory(self, state, prefix="train_info"):
-        """ Simple method to log memory usage at the end of every training batch. """
+        """Simple method to log memory usage at the end of every training batch."""
         if state.is_world_process_zero and torch.cuda.is_available():
             memory_usage = {
                 f"{prefix}/memory_allocated": torch.cuda.memory_allocated() / 2 ** 20,
@@ -254,7 +255,7 @@ def on_train_begin(
         eval_dataloader=None,
         **kwargs,
     ):
-        """ Calls wandb.init, we add additional arguments to that call using this method. """
+        """Calls wandb.init, we add additional arguments to that call using this method."""
 
         # Pass in additional keyword arguments to the wandb.init call as kwargs
         super().on_train_begin(
@@ -325,7 +326,7 @@ def on_log(
 
 
 class CustomCheckpointCallback(TrainerCallback):
-    """ Custom Checkpoint Callback used by Mistral for Saving Checkpoints at different frequencies. """
+    """Custom Checkpoint Callback used by Mistral for Saving Checkpoints at different frequencies."""
 
     def __init__(self, frequencies: List[List[int]]):
         super(CustomCheckpointCallback, self).__init__()
@@ -337,7 +338,7 @@ def __init__(self, frequencies: List[List[int]]):
         assert all(i < j for i, j in zip(self.until, self.until[1:])), "Frequency `until_step` not increasing!"
 
     def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
-        """ Borrow Checkpoint Logic from `DefaultFlowCallback` to decide when to checkpoint. """
+        """Borrow Checkpoint Logic from `DefaultFlowCallback` to decide when to checkpoint."""
 
         # Save (note we explicitly save checkpoint-0 in `train.py`, so no need to do it here)
         c = state.global_step
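
For readers tracing the checkpoint logic above, one plausible reading of the `frequencies` schedule
(a list of `[save_every, until_step]` pairs resolved with `bisect_left`) is sketched below. The schedule
values are illustrative, mirroring the cadence described in the README rather than any actual Mistral
configuration:

```python
from bisect import bisect_left

# Illustrative schedule: save every 100 steps until step 20,000, then
# every 1,000 steps until step 400,000.
frequencies = [[100, 20000], [1000, 400000]]
freq, until = zip(*frequencies)

def should_checkpoint(step: int) -> bool:
    # Find the first interval whose `until_step` covers this step, then
    # check divisibility against that interval's save frequency.
    i = bisect_left(until, step)
    return i < len(freq) and step % freq[i] == 0

assert should_checkpoint(2100)       # every 100 steps until 20,000
assert not should_checkpoint(20500)  # only every 1,000 steps afterwards
assert should_checkpoint(21000)
```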