diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml
index 9ead54d9a..bc4106b84 100644
--- a/.github/workflows/inference_cache_llm.yml
+++ b/.github/workflows/inference_cache_llm.yml
@@ -39,7 +39,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e -y
+          sudo apt-get install aws-neuronx-tools=2.18.3.0 aws-neuronx-runtime-lib=2.21.41.0-fb1705f5f aws-neuronx-collectives=2.21.46.0-69b77134b -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v4
diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py
index 312d1a91c..81ff32c81 100644
--- a/optimum/exporters/neuron/utils.py
+++ b/optimum/exporters/neuron/utils.py
@@ -387,6 +387,7 @@ def get_submodels_for_export_stable_diffusion(
     text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
     if text_encoder_2 is not None:
         text_encoder_2.config.output_hidden_states = True
+        text_encoder_2.text_model.config.output_hidden_states = True
         models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, copy.deepcopy(text_encoder_2)))
 
     # U-NET
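Note: the added line mirrors the existing flag on the inner `text_model`, whose config can end up as a separate object from the encoder's top-level config in recent transformers releases, so setting only the outer flag is not always reflected at trace time. A minimal sketch of the same idea outside the exporter (the checkpoint and subfolder are illustrative, not taken from this patch):

```python
from transformers import CLIPTextModelWithProjection

# Illustrative SDXL-style second text encoder; the subfolder layout follows the usual
# diffusers convention and is an assumption, not something this patch relies on.
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="text_encoder_2"
)

# Request hidden states on both the outer config and the inner text_model config,
# matching what the exporter now does before deep-copying the encoder for export.
text_encoder_2.config.output_hidden_states = True
text_encoder_2.text_model.config.output_hidden_states = True
```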
diff --git a/optimum/neuron/generation/token_selector.py b/optimum/neuron/generation/token_selector.py
index 3d0935cc4..6edd4fd1c 100644
--- a/optimum/neuron/generation/token_selector.py
+++ b/optimum/neuron/generation/token_selector.py
@@ -92,6 +92,7 @@ def create(
         """
         generation_config.validate()
         generation_config = copy.deepcopy(generation_config)
+        model._prepare_special_tokens(generation_config)
 
         unsupported_generation_flags = [
             "output_attentions",
diff --git a/pyproject.toml b/pyproject.toml
index 572fda1d3..01d30af5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ line-length = 119
 # Never enforce `E501` (line length violations).
 ignore = ["C901", "E501", "E741", "W605"]
 select = ["C", "E", "F", "I", "W"]
+exclude = ["*.ipynb"]
 
 # Ignore import violations in all `__init__.py` files.
 [tool.ruff.lint.per-file-ignores]
diff --git a/setup.py b/setup.py
index 63febc0fd..f2e770e51 100644
--- a/setup.py
+++ b/setup.py
@@ -13,9 +13,9 @@
 INSTALL_REQUIRES = [
-    "transformers == 4.41.1",
+    "transformers == 4.43.2",
     "accelerate == 0.29.2",
-    "optimum ~= 1.20.0",
+    "optimum ~= 1.21.0",
     "huggingface_hub >= 0.20.1",
     "numpy>=1.22.2, <=1.25.2",
     "protobuf<4",
diff --git a/tests/cache/test_neuronx_cache.py b/tests/cache/test_neuronx_cache.py
index a5f3c3886..684e3b6fe 100644
--- a/tests/cache/test_neuronx_cache.py
+++ b/tests/cache/test_neuronx_cache.py
@@ -34,14 +34,14 @@
 )
 from optimum.neuron.utils import get_hub_cached_entries, synchronize_hub_cache
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
-from optimum.utils.testing_utils import TOKEN
 
 
 @pytest.fixture
-def cache_repos():
+def cache_repos(staging):
     # Setup: create temporary Hub repository and local cache directory
-    api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
-    user = api.whoami()["name"]
+    token = staging["token"]
+    user = staging["user"]
+    api = HfApi(endpoint=ENDPOINT_STAGING, token=token)
     hostname = socket.gethostname()
     cache_repo_id = f"{user}/{hostname}-optimum-neuron-cache"
     if api.repo_exists(cache_repo_id):
@@ -57,7 +57,7 @@ def cache_repos():
     os.environ["NEURON_COMPILE_CACHE_URL"] = cache_path
     os.environ["CUSTOM_CACHE_REPO"] = cache_repo_id
     os.environ["HF_ENDPOINT"] = ENDPOINT_STAGING
-    os.environ["HF_TOKEN"] = TOKEN
+    os.environ["HF_TOKEN"] = token
     yield (cache_path, cache_repo_id)
     # Teardown
     api.delete_repo(cache_repo_id)
@@ -173,7 +173,8 @@ def check_traced_cache_entry(cache_path):
 
 
 def assert_local_and_hub_cache_sync(cache_path, cache_repo_id):
-    api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
+    # Since created models are public on the staging endpoint we don't need a token
+    api = HfApi(endpoint=ENDPOINT_STAGING)
     remote_files = api.list_repo_files(cache_repo_id)
     local_files = get_local_cached_files(cache_path)
     for file in local_files:
diff --git a/tests/conftest.py b/tests/conftest.py
index 8062756a5..539136409 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,7 +26,7 @@
     set_neuron_cache_path,
 )
 
-from .utils import OPTIMUM_INTERNAL_TESTING_CACHE_REPO, get_random_string
+from .utils import OPTIMUM_INTERNAL_TESTING_CACHE_REPO, TOKEN_STAGING, USER_STAGING, get_random_string
 
 
 # Inferentia fixtures
@@ -171,3 +171,19 @@ def pytest_fixture_setup(fixturedef, request):
     if getattr(fixturedef.func, "is_dist_fixture", False):
         dist_fixture_class = fixturedef.func()
         dist_fixture_class(request)
+
+
+@pytest.fixture
+def staging():
+    """A pytest fixture only available in huggingface_hub staging mode
+
+    If huggingface_hub is not operating in staging mode, tests using
+    that fixture are automatically skipped.
+
+    Returns:
+        a Dict containing a valid staging user and token.
+    """
+    return {
+        "user": USER_STAGING,
+        "token": TOKEN_STAGING,
+    }
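Note: the new fixture is consumed like any other pytest fixture; it returns a plain dict, so credentials are read with item access. A minimal sketch of a test using it against the staging endpoint (the repository name is made up for illustration, not part of this patch):

```python
from huggingface_hub import HfApi
from transformers.testing_utils import ENDPOINT_STAGING


def test_staging_repo_roundtrip(staging):
    # `staging` is the dict returned by the fixture above: {"user": ..., "token": ...}.
    api = HfApi(endpoint=ENDPOINT_STAGING, token=staging["token"])
    repo_id = f"{staging['user']}/hypothetical-smoke-test-repo"
    api.create_repo(repo_id, repo_type="model")
    try:
        assert api.repo_exists(repo_id)
    finally:
        api.delete_repo(repo_id)
```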
diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py
index c50c3c72c..b8346a7fb 100644
--- a/tests/decoder/conftest.py
+++ b/tests/decoder/conftest.py
@@ -58,8 +58,7 @@ def _export_model(model_id, export_kwargs, neuron_model_path):
     try:
         subprocess.run(export_command, check=True)
     except subprocess.CalledProcessError as e:
-        logger.error(f"Failed to export model: {e}")
-        return
+        raise SystemError(f"Failed to export model: {e}")
 
 
 @pytest.fixture(scope="session", params=DECODER_MODEL_CONFIGURATIONS.keys())
diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py
index 7e372ad9a..d94e0f0e6 100644
--- a/tests/generation/test_hub.py
+++ b/tests/generation/test_hub.py
@@ -13,39 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-import re
 
 from huggingface_hub import HfApi
 from transformers.testing_utils import ENDPOINT_STAGING
 
 from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
-from optimum.utils.testing_utils import TOKEN, USER
-
-
-def _test_push_to_hub(model, model_path, repo_id, ignore_patterns=[]):
-    model.push_to_hub(model_path, repo_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING)
-    api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
-    try:
-        hub_files_path = api.list_repo_files(repo_id)
-        for path, _, files in os.walk(model_path):
-            for name in files:
-                local_file_path = os.path.join(path, name)
-                hub_file_path = os.path.relpath(local_file_path, model_path)
-                excluded = False
-                for pattern in ignore_patterns:
-                    if re.compile(pattern).match(hub_file_path) is not None:
-                        excluded = True
-                        break
-                assert excluded or hub_file_path in hub_files_path
-    finally:
-        api.delete_repo(repo_id)
-
-
-def neuron_push_model_id(model_id):
-    model_name = model_id.split("/")[-1]
-    repo_id = f"{USER}/{model_name}-neuronx"
-    return repo_id
 
 @is_inferentia_test
 @requires_neuronx
@@ -59,6 +32,18 @@ def test_seq2seq_model_from_hub():
 
 @is_inferentia_test
 @requires_neuronx
-def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id):
+def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, staging):
     model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path)
-    _test_push_to_hub(model, neuron_seq2seq_greedy_path, neuron_push_seq2seq_id)
+    model.push_to_hub(
+        neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, use_auth_token=staging["token"], endpoint=ENDPOINT_STAGING
+    )
+    api = HfApi(endpoint=ENDPOINT_STAGING, token=staging["token"])
+    try:
+        hub_files_path = api.list_repo_files(neuron_push_seq2seq_id)
+        for path, _, files in os.walk(neuron_seq2seq_greedy_path):
+            for name in files:
+                local_file_path = os.path.join(path, name)
+                hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_greedy_path)
+                assert hub_file_path in hub_files_path
+    finally:
+        api.delete_repo(neuron_push_seq2seq_id)
diff --git a/tests/test_cache_utils.py b/tests/test_cache_utils.py
index 9e7d45370..0effb7387 100644
--- a/tests/test_cache_utils.py
+++ b/tests/test_cache_utils.py
@@ -37,9 +37,8 @@
     set_neuron_cache_path,
 )
 from optimum.neuron.utils.testing_utils import is_trainium_test
-from optimum.utils.testing_utils import TOKEN, USER
 
-from .utils import StagingTestMixin, TrainiumTestMixin, get_random_string
+from .utils import TOKEN_STAGING, USER_STAGING, StagingTestMixin, TrainiumTestMixin, get_random_string
 
 
 DUMMY_COMPILER_VERSION = "1.2.3"
@@ -147,10 +146,10 @@ def test_list_files_in_neuron_cache(self):
 class StagingNeuronUtilsTestCase(StagingTestMixin, TestCase):
     def test_set_custom_cache_repo_name_in_hf_home(self):
         orig_token = get_token()
-        login(TOKEN)
+        login(TOKEN_STAGING)
 
         repo_name = f"blablabla-{self.seed}"
-        repo_id = f"{USER}/{repo_name}"
+        repo_id = f"{USER_STAGING}/{repo_name}"
         create_repo(repo_name, repo_type="model")
 
         def remove_repo():
diff --git a/tests/utils.py b/tests/utils.py
index 7bd6c279b..060c77596 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -58,11 +58,15 @@
 from optimum.neuron.utils.patching import DynamicPatch, Patcher
 from optimum.neuron.utils.require_utils import requires_neuronx_distributed
 from optimum.utils import logging
-from optimum.utils.testing_utils import TOKEN, USER
 
 
 logger = logging.get_logger(__name__)
 
+
+# Not critical, only usable on the sandboxed CI instance.
+USER_STAGING = "__DUMMY_OPTIMUM_USER__"
+TOKEN_STAGING = "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"
+
 SEED = 42
 
 OPTIMUM_INTERNAL_TESTING_CACHE_REPO = "optimum-internal-testing/optimum-neuron-cache-for-testing"
@@ -450,7 +454,7 @@ def tearDownClass(cls):
 
 class StagingTestMixin:
     CUSTOM_CACHE_REPO_NAME = "optimum-neuron-cache-testing"
-    CUSTOM_CACHE_REPO = f"{USER}/{CUSTOM_CACHE_REPO_NAME}"
+    CUSTOM_CACHE_REPO = f"{USER_STAGING}/{CUSTOM_CACHE_REPO_NAME}"
     CUSTOM_PRIVATE_CACHE_REPO = f"{CUSTOM_CACHE_REPO}-private"
     _token = ""
     MAX_NUM_LINEARS = 20
@@ -468,8 +472,8 @@ def set_hf_hub_token(cls, token: Optional[str]) -> Optional[str]:
 
     @classmethod
     def setUpClass(cls):
-        cls._staging_token = TOKEN
-        cls._token = cls.set_hf_hub_token(TOKEN)
+        cls._staging_token = TOKEN_STAGING
+        cls._token = cls.set_hf_hub_token(TOKEN_STAGING)
         cls._custom_cache_repo_name = load_custom_cache_repo_name_from_hf_home()
         delete_custom_cache_repo_name_from_hf_home()
 
@@ -511,6 +515,6 @@ def remove_all_files_in_repo(self, repo_id: str):
             pass
 
     def tearDown(self):
-        login(TOKEN)
+        login(TOKEN_STAGING)
         self.remove_all_files_in_repo(self.CUSTOM_CACHE_REPO)
         self.remove_all_files_in_repo(self.CUSTOM_PRIVATE_CACHE_REPO)
diff --git a/text-generation-inference/tests/fixtures/model.py b/text-generation-inference/tests/fixtures/model.py
index c94d45784..b1e785308 100644
--- a/text-generation-inference/tests/fixtures/model.py
+++ b/text-generation-inference/tests/fixtures/model.py
@@ -54,8 +54,7 @@ def export_model(model_id, export_kwargs, neuron_model_path):
     try:
         subprocess.run(export_command, check=True)
     except subprocess.CalledProcessError as e:
-        logger.error(f"Failed to export model: {e}")
-        return
+        raise ValueError(f"Failed to export model: {e}")
 
 
 @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())