Add fixtures (#673)
Add fixtures for shared test boilerplate, tiny MPT models, and a tiny finetuning dataset
irenedea committed Oct 25, 2023
1 parent bc687b7 commit ea3279a
Showing 14 changed files with 272 additions and 243 deletions.
18 changes: 7 additions & 11 deletions tests/conftest.py
@@ -1,12 +1,10 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import gc
import os
from typing import List, Optional

import pytest
import torch
from composer.utils import reproducibility

# Allowed options for pytest.mark.world_size()
@@ -18,6 +16,13 @@
# Enforce deterministic mode before any tests start.
reproducibility.configure_deterministic_mode()

# Add the path of any pytest fixture files you want to make global
pytest_plugins = [
'tests.fixtures.autouse',
'tests.fixtures.models',
'tests.fixtures.data',
]


def _add_option(parser: pytest.Parser,
name: str,
@@ -78,12 +83,3 @@ def pytest_collection_modifyitems(config: pytest.Config,
def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
if exitstatus == 5:
session.exitstatus = 0 # Ignore no-test-ran errors


@pytest.fixture(autouse=True)
def clear_cuda_cache(request: pytest.FixtureRequest):
"""Clear memory between GPU tests."""
marker = request.node.get_closest_marker('gpu')
if marker is not None and torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests
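
For context, the `pytest_plugins` list registered above makes every fixture defined under `tests/fixtures/` resolvable by name in any test module, with no explicit import. A minimal sketch of a consumer (hypothetical test file, not part of this commit):

# tests/test_fixture_usage.py (hypothetical example)
from transformers import PreTrainedTokenizerBase


def test_tokenizer_fixture_is_global(mpt_tokenizer: PreTrainedTokenizerBase):
    # `mpt_tokenizer` is provided by tests/fixtures/models.py via pytest_plugins;
    # no import of the fixture module is needed here.
    assert mpt_tokenizer.vocab_size > 0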
2 changes: 2 additions & 0 deletions tests/fixtures/__init__.py
@@ -0,0 +1,2 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
39 changes: 39 additions & 0 deletions tests/fixtures/autouse.py
@@ -0,0 +1,39 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import gc

import pytest
import torch
from composer.utils import dist, get_device, reproducibility


@pytest.fixture(autouse=True)
def initialize_dist(request: pytest.FixtureRequest):
"""Initialize the default PyTorch distributed process group for tests."""
# should we just always initialize dist like in train.py?
_default = pytest.mark.world_size(1).mark
world_size = request.node.get_closest_marker('world_size', _default).args[0]
gpu = request.node.get_closest_marker('gpu')
if world_size > 1:
dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu'))


@pytest.fixture(autouse=True)
def clear_cuda_cache(request: pytest.FixtureRequest):
"""Clear memory between GPU tests."""
marker = request.node.get_closest_marker('gpu')
if marker is not None and torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests


@pytest.fixture
def random_seed() -> int:
return 17


@pytest.fixture(autouse=True)
def seed_all(random_seed: int):
"""Sets the seed for reproducibility."""
reproducibility.seed_all(random_seed)
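
With these autouse fixtures, a distributed GPU test only needs the `gpu` and `world_size` markers; process-group setup, CUDA cache clearing, and seeding happen automatically. A sketch of how such a test might look (hypothetical, not part of this commit):

import pytest
import torch
from composer.utils import dist


@pytest.mark.gpu
@pytest.mark.world_size(2)
def test_allreduce_across_two_ranks():
    # initialize_dist has already created the process group because the
    # world_size marker is greater than 1; seed_all has also run.
    x = torch.ones(1, device='cuda')
    dist.all_reduce(x)  # defaults to a SUM reduction
    assert x.item() == 2.0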
58 changes: 58 additions & 0 deletions tests/fixtures/data.py
@@ -0,0 +1,58 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from pathlib import Path

from composer.utils import dist
from omegaconf import DictConfig
from pytest import fixture
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerBase

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from tests.data_utils import make_tiny_ft_dataset


@fixture
def tiny_ft_dataset_path(tmp_path: Path, dataset_size: int = 4) -> Path:
"""Creates a tiny dataset and returns the path."""
tiny_dataset_path = tmp_path / 'test-ift-data-small'
tiny_dataset_path.mkdir(exist_ok=True)
tiny_dataset_file = tiny_dataset_path / 'train.jsonl'
if dist.get_world_size() == 1 or dist.get_global_rank() == 0:
make_tiny_ft_dataset(path=str(tiny_dataset_file), size=dataset_size)
return tiny_dataset_path


@fixture
def tiny_ft_dataloader(tiny_ft_dataset_path: Path,
mpt_tokenizer: PreTrainedTokenizerBase,
max_seq_len: int = 128,
device_batch_size: int = 1) -> DataLoader:
dataloader_cfg = DictConfig({
'name': 'finetuning',
'dataset': {
'hf_name': str(tiny_ft_dataset_path),
'split': 'train',
'max_seq_len': max_seq_len,
'decoder_only_format': True,
'allow_pad_trimming': False,
'packing_ratio': None,
'shuffle': True,
},
'drop_last': False,
'num_workers': 4,
'pin_memory': False,
'prefetch_factor': 2,
'persistent_workers': False,
'timeout': 0
})

dataloader = build_finetuning_dataloader(
dataloader_cfg,
mpt_tokenizer,
device_batch_size,
).dataloader

assert isinstance(dataloader, DataLoader)
return dataloader
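
A test that depends on `tiny_ft_dataloader` therefore receives a fully built finetuning DataLoader over the four-example JSONL dataset above. A minimal sketch of a consumer (hypothetical; the batch keys `input_ids` and `labels` are assumed from the finetuning collator):

from torch.utils.data import DataLoader


def test_tiny_ft_dataloader_yields_batches(tiny_ft_dataloader: DataLoader):
    batch = next(iter(tiny_ft_dataloader))
    # Assumed collator output keys; adjust if the collator differs.
    assert 'input_ids' in batch
    assert 'labels' in batch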
70 changes: 70 additions & 0 deletions tests/fixtures/models.py
@@ -0,0 +1,70 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Callable

from omegaconf import DictConfig
from pytest import fixture
from transformers import PreTrainedTokenizerBase

from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
from llmfoundry.utils.builders import build_tokenizer


def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase):
model = COMPOSER_MODEL_REGISTRY[config.name](config, tokenizer)
return model


@fixture
def mpt_tokenizer():
return build_tokenizer('EleutherAI/gpt-neox-20b', {})


@fixture
def build_tiny_mpt(
mpt_tokenizer: PreTrainedTokenizerBase
) -> Callable[..., ComposerMPTCausalLM]:

def build(**kwargs: Any) -> ComposerMPTCausalLM:
config = DictConfig({
'name': 'mpt_causal_lm',
'd_model': 128,
'n_heads': 4,
'n_layers': 2,
'expansion_ratio': 2,
})
config.update(kwargs)
model = _build_model(config, mpt_tokenizer)
assert isinstance(model, ComposerMPTCausalLM)
return model

return build


@fixture
def build_tiny_hf_mpt(
mpt_tokenizer: PreTrainedTokenizerBase
) -> Callable[..., ComposerHFCausalLM]:

def build(**kwargs: Any) -> ComposerHFCausalLM:
config_overrides = {
'd_model': 128,
'n_heads': 4,
'n_layers': 2,
'expansion_ratio': 2,
}
config_overrides.update(kwargs)
config = DictConfig({
'name': 'hf_causal_lm',
'pretrained_model_name_or_path': 'mosaicml/mpt-7b',
'pretrained': False,
'config_overrides': config_overrides,
})
model = _build_model(config, mpt_tokenizer)
assert isinstance(model, ComposerHFCausalLM)
return model

return build
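
Because `build_tiny_mpt` and `build_tiny_hf_mpt` return factory callables rather than built models, individual tests can override config fields per call. A sketch of that usage (hypothetical test; assumes the Composer wrapper exposes the underlying HF model as `.model`):

from typing import Callable

from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM


def test_tiny_mpt_config_override(
        build_tiny_mpt: Callable[..., ComposerMPTCausalLM]):
    # Keyword arguments are merged into the tiny base config before building.
    model = build_tiny_mpt(n_layers=4)
    assert model.model.config.n_layers == 4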
18 changes: 7 additions & 11 deletions tests/test_data_prep_scripts.py
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0

import os
import shutil
import sys
from argparse import Namespace
from pathlib import Path

# Add repo root to path so we can import scripts and test it
repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
@@ -13,17 +13,16 @@
from scripts.data_prep.convert_dataset_json import main as main_json


def test_download_script_from_api():
def test_download_script_from_api(tmp_path: Path):
# test calling it directly
path = os.path.join(os.getcwd(), 'my-copy-c4-1')
shutil.rmtree(path, ignore_errors=True)
path = os.path.join(tmp_path, 'my-copy-c4-1')
main_hf(
Namespace(
**{
'dataset': 'c4',
'data_subset': 'en',
'splits': ['val_xsmall'],
'out_root': './my-copy-c4-1',
'out_root': path,
'compression': None,
'concat_tokens': None,
'bos_text': None,
@@ -32,18 +31,16 @@ def test_download_script_from_api():
'num_workers': None
}))
assert os.path.exists(path)
shutil.rmtree(path, ignore_errors=False)


def test_json_script_from_api():
def test_json_script_from_api(tmp_path: Path):
# test calling it directly
path = os.path.join(os.getcwd(), 'my-copy-arxiv-1')
shutil.rmtree(path, ignore_errors=True)
path = os.path.join(tmp_path, 'my-copy-arxiv-1')
main_json(
Namespace(
**{
'path': 'scripts/data_prep/example_data/arxiv.jsonl',
'out_root': './my-copy-arxiv-1',
'out_root': path,
'compression': None,
'split': 'train',
'concat_tokens': None,
@@ -53,4 +50,3 @@ def test_json_script_from_api():
'num_workers': None
}))
assert os.path.exists(path)
shutil.rmtree(path, ignore_errors=False)
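
The change above swaps hand-rolled `os.getcwd()` paths plus `shutil.rmtree` cleanup for pytest's built-in `tmp_path` fixture, which gives each test a unique temporary directory that pytest manages and prunes on its own. A minimal sketch of the pattern (hypothetical test, not part of this commit):

import os
from pathlib import Path


def test_output_lands_in_tmp_path(tmp_path: Path):
    out_root = os.path.join(tmp_path, 'my-output')
    os.makedirs(out_root)
    (Path(out_root) / 'data.txt').write_text('hello')
    # tmp_path is unique per test, so no manual teardown is required.
    assert os.path.exists(out_root)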
9 changes: 0 additions & 9 deletions tests/test_flash_triton_torch.py
@@ -3,7 +3,6 @@

import pytest
import torch
from composer.utils import reproducibility
from omegaconf import OmegaConf as om


@@ -39,8 +38,6 @@ def test_attn_impl(attn_impl_0: str,
if alibi and (attn_impl_0 == 'flash' or attn_impl_1 == 'flash'):
pytest.xfail('flash attn does not support alibi')

reproducibility.seed_all(7)

cfg = om.create({
'attn_impl': 'flash',
'd_model': 128,
@@ -135,8 +132,6 @@ def test_vs_mha(attn_impl: str, device: str = 'cuda'):
"""Compare diff attn_impl to torch.nn.MultiheadAttention."""
from llmfoundry.models.layers import attention

reproducibility.seed_all(17)

cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
@@ -234,8 +229,6 @@ def test_grouped_attention_heads(attn_impl: str,
"""Ensure grouped_query_attention runs w/ diff n_heads & kv_n_heads."""
from llmfoundry.models.layers import attention

reproducibility.seed_all(17)

cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
@@ -273,8 +266,6 @@ def test_grouped_query_invalid_heads(attn_impl: str, device: str = 'cuda'):
"""Check indivisble combinations of grouped_query_attention."""
from llmfoundry.models.layers import attention

reproducibility.seed_all(17)

cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
3 changes: 0 additions & 3 deletions tests/test_hf_config.py
@@ -9,7 +9,6 @@

import pytest
import torch
from composer.utils import reproducibility
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from transformers import AutoModelForCausalLM
@@ -93,8 +92,6 @@ def test_hf_config_override(
with open(conf_path) as f:
test_cfg = om.load(f)

reproducibility.seed_all(test_cfg.seed)

# Build Model
# For fast initialization, use `meta` device
print('Initializing model...')