From 269ded6d074410845cba08d495b4e54dd673d1cf Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Fri, 17 Nov 2023 23:07:06 -0800
Subject: [PATCH] Reenable tests that were accidentally disabled (#746)

---
 tests/test_model.py | 187 ++++++++++++++++++--------------------------
 1 file changed, 75 insertions(+), 112 deletions(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index 3308c65fd3..c160c064dc 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -304,17 +304,13 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2):
     assert not torch.equal(original_params, updated_params)
 
 
+@pytest.mark.gpu
 @pytest.mark.parametrize(
     'attn_impl,precision',
     [('torch', torch.float16), ('torch', torch.bfloat16),
      pytest.param('flash', torch.float16, marks=pytest.mark.gpu),
      pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)])
 def test_determinism(attn_impl: str, precision: torch.dtype):
-    if not torch.cuda.is_available():
-        pytest.skip(
-            'This test requires CUDA to be available in order to run with bfloat16 precision.'
-        )
-
     conf_path = 'scripts/train/yamls/pretrain/testing.yaml'
     with open(conf_path) as f:
         test_cfg = om.load(f)
@@ -519,10 +515,12 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool):
         assert block.resid_ffn_dropout.p == 0.2
 
 
-@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'),
-                                                   ('flash', 'gpu'),
-                                                   ('triton', 'gpu'),
-                                                   ('torch', 'gpu')])
+@pytest.mark.parametrize('attention_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu)
+])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
     'rope': False
@@ -550,24 +548,20 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool):
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_forward_with_padding(attention_impl: str, device: str,
-                              pos_emb_config: dict, tie_word_embeddings: bool):
+def test_forward_with_padding(attention_impl: str, pos_emb_config: dict,
+                              tie_word_embeddings: bool):
     # Test that different placement of padding does not affect the output.
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
-        )
     alibi = pos_emb_config['alibi']
     if alibi and attention_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
 
     rope = pos_emb_config['rope']
-    if rope and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if rope and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -743,12 +737,12 @@ def test_advanced_mask_building(attention_impl: str):
     assert torch.equal(attn_bias, expected_attn_bias)
 
 
-@pytest.mark.parametrize('attention_impl,device,precision', [
-    ('torch', 'cpu', 'fp32'),
-    ('flash', 'gpu', 'amp_bf16'),
-    ('triton', 'gpu', 'amp_bf16'),
-    ('torch', 'gpu', 'amp_bf16'),
-    ('torch', 'gpu', 'fp32'),
+@pytest.mark.parametrize('attention_impl,precision', [
+    ('torch', 'fp32'),
+    pytest.param('flash', 'amp_bf16', marks=pytest.mark.gpu),
+    pytest.param('triton', 'amp_bf16', marks=pytest.mark.gpu),
+    pytest.param('torch', 'amp_bf16', marks=pytest.mark.gpu),
+    pytest.param('torch', 'fp32', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -777,25 +771,21 @@ def test_advanced_mask_building(attention_impl: str):
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_generate(attention_impl: str, device: str, precision: str,
-                  pos_emb_config: dict, tie_word_embeddings: bool):
+def test_generate(attention_impl: str, precision: str, pos_emb_config: dict,
+                  tie_word_embeddings: bool):
     # Test that generate works, and produces the same output with or without
     # padding in the input.
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attention_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
 
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
     if attention_impl == 'torch' and precision == 'amp_bf16' and tie_word_embeddings == False:
         pytest.skip(f'This test configuration has precision / sampling issues.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -878,8 +868,6 @@ def test_generate(attention_impl: str, device: str, precision: str,
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
 def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
                                   use_cache: bool, tie_word_embeddings: bool):
-    if not torch.cuda.is_available():
-        pytest.skip(f'This test requires CUDA to be available.')
     if not torch.cuda.device_count() >= world_size:
         pytest.skip(f'This test requires {world_size} GPUs.')
 
@@ -978,11 +966,11 @@ def test_save_from_pretrained(tmp_path: pathlib.Path):
     check_hf_model_equivalence(mpt, mpt2)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1011,22 +999,17 @@ def test_save_from_pretrained(tmp_path: pathlib.Path):
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_forward_with_cache_and_padding(attn_impl: str, device: str,
-                                        pos_emb_config: dict,
+def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict,
                                         tie_word_embeddings: bool):
     # Tests that the result is the same with or without padding when using kv caching
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -1120,11 +1103,11 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str,
                         rtol=1e-6)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1153,23 +1136,19 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str,
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
+def test_forward_with_cache(attn_impl: str, pos_emb_config: dict,
                             tie_word_embeddings: bool):
     # Test that model forward with and without the key-value cache produces the
     # same output.
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
 
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -1264,11 +1243,11 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
         )
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1297,20 +1276,16 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_generate_with_past_kv(attn_impl: str, device: str,
-                               pos_emb_config: dict, tie_word_embeddings: bool):
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
+def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict,
+                               tie_word_embeddings: bool):
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
        pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     hf_config = MPTConfig(
         init_device='cpu',
@@ -1368,11 +1343,11 @@ def test_generate_with_past_kv(attn_impl: str, device: str,
                                    hf_config.d_model)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('generation_kwargs', [{
     'max_new_tokens': 2,
@@ -1412,24 +1387,22 @@ def test_generate_with_past_kv(attn_impl: str, device: str,
     },
 }])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
+def test_generation_kwargs_dont_crash(attn_impl: str,
                                       generation_kwargs: Dict[str, Any],
                                       pos_emb_config: dict,
                                       tie_word_embeddings: bool):
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
-    if device == 'gpu':  # Switch deteminism off
+    composer_device = get_device(None)
+
+    if composer_device.name == 'gpu':
         torch.use_deterministic_algorithms(False)
+
     hf_config = MPTConfig(
         init_device='cpu',
         d_model=128,
@@ -1463,7 +1436,8 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
     _ = mpt.generate(input_ids=no_padding_input_ids,
                      attention_mask=no_padding_attention_mask,
                      **generation_kwargs)
-    if device == 'gpu':  # Switch deteminism back on
+
+    if composer_device.name == 'gpu':
         reproducibility.configure_deterministic_mode()
 
 
@@ -1499,10 +1473,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
 def test_model_to(attention_impl: str, pos_emb_config: dict,
                   tie_word_embeddings: bool):
     # test that moving the model to diff devices and dtypes in diff ways does not break the model
-    if not torch.cuda.is_available():
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attention_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
 
@@ -1597,11 +1567,11 @@ def test_alibi_vs_hf():
         torch.testing.assert_close(alibi_bias_hf, alibi_bias_m)
 
 
-@pytest.mark.parametrize('attn_impl,device', [
-    ('torch', 'cpu'),
-    ('flash', 'gpu'),
-    ('triton', 'gpu'),
-    ('torch', 'gpu'),
+@pytest.mark.parametrize('attn_impl', [
+    'torch',
+    pytest.param('flash', marks=pytest.mark.gpu),
+    pytest.param('triton', marks=pytest.mark.gpu),
+    pytest.param('torch', marks=pytest.mark.gpu),
 ])
 @pytest.mark.parametrize('pos_emb_config', [{
     'alibi': False,
@@ -1633,24 +1603,19 @@ def test_alibi_vs_hf():
 @pytest.mark.parametrize('output_hidden_states', [True, False])
 @pytest.mark.parametrize('tie_word_embeddings', [True, False])
 def test_forward_with_output_attentions_and_output_hidden_states(
-        attn_impl: str, device: str, pos_emb_config: dict,
-        output_attentions: bool, output_hidden_states: bool,
-        tie_word_embeddings: bool):
+        attn_impl: str, pos_emb_config: dict, output_attentions: bool,
+        output_hidden_states: bool, tie_word_embeddings: bool):
     # Test that model forward with output_attentions_and_output_hidden_states
-    if not torch.cuda.is_available() and device == 'gpu':
-        pytest.skip(
-            f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
-        )
     if pos_emb_config['alibi'] and attn_impl == 'flash':
         pytest.skip(f'alibi only implemented with torch and triton attention.')
     if output_attentions and attn_impl in ['flash', 'triton']:
         pytest.skip(f'output_attentions only implemented with torch attention.')
-    if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
-            device != 'gpu' or not is_flash_v2_installed()):
+    if pos_emb_config['rope'] and pos_emb_config[
+            'rope_impl'] == 'dail' and not is_flash_v2_installed():
         pytest.skip(
             f'dail implementation of rope requires gpu and flash attention 2.')
 
-    composer_device = get_device(device)
+    composer_device = get_device(None)
 
     n_layers = 2
 
@@ -1708,8 +1673,6 @@ def test_hf_init(tmp_path: pathlib.Path,
                  init_device: str,
                  world_size: int,
                  batch_size: int = 1):
-    if not torch.cuda.is_available():
-        pytest.skip(f'This test requires CUDA to be available.')
     if not torch.cuda.device_count() >= world_size:
         pytest.skip(f'This test requires {world_size} GPUs.')