diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml
index 9ead54d9a..bc4106b84 100644
--- a/.github/workflows/inference_cache_llm.yml
+++ b/.github/workflows/inference_cache_llm.yml
@@ -39,7 +39,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e -y
+          sudo apt-get install aws-neuronx-tools=2.18.3.0 aws-neuronx-runtime-lib=2.21.41.0-fb1705f5f aws-neuronx-collectives=2.21.46.0-69b77134b -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v4
diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py
index 312d1a91c..81ff32c81 100644
--- a/optimum/exporters/neuron/utils.py
+++ b/optimum/exporters/neuron/utils.py
@@ -387,6 +387,7 @@ def get_submodels_for_export_stable_diffusion(
     text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
     if text_encoder_2 is not None:
         text_encoder_2.config.output_hidden_states = True
+        text_encoder_2.text_model.config.output_hidden_states = True
         models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, copy.deepcopy(text_encoder_2)))
 
     # U-NET
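Note: the added line mirrors the existing flag on the inner `text_model`, whose config can end up as a separate object from the encoder's top-level config in recent transformers releases, so setting only the outer flag is not always reflected at trace time. A minimal sketch of the same idea outside the exporter (the checkpoint and subfolder are illustrative, not taken from this patch):

```python
from transformers import CLIPTextModelWithProjection

# Illustrative SDXL-style second text encoder; the subfolder layout follows the usual
# diffusers convention and is an assumption, not something this patch relies on.
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="text_encoder_2"
)

# Request hidden states on both the outer config and the inner text_model config,
# matching what the exporter now does before deep-copying the encoder for export.
text_encoder_2.config.output_hidden_states = True
text_encoder_2.text_model.config.output_hidden_states = True
```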
diff --git a/optimum/neuron/generation/token_selector.py b/optimum/neuron/generation/token_selector.py
index 3d0935cc4..6edd4fd1c 100644
--- a/optimum/neuron/generation/token_selector.py
+++ b/optimum/neuron/generation/token_selector.py
@@ -92,6 +92,7 @@ def create(
         """
         generation_config.validate()
         generation_config = copy.deepcopy(generation_config)
+        model._prepare_special_tokens(generation_config)
 
         unsupported_generation_flags = [
             "output_attentions",
diff --git a/pyproject.toml b/pyproject.toml
index 572fda1d3..01d30af5b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,6 +27,7 @@ line-length = 119
 # Never enforce `E501` (line length violations).
 ignore = ["C901", "E501", "E741", "W605"]
 select = ["C", "E", "F", "I", "W"]
+exclude = ["*.ipynb"]
 
 # Ignore import violations in all `__init__.py` files.
 [tool.ruff.lint.per-file-ignores]
diff --git a/setup.py b/setup.py
index 63febc0fd..f2e770e51 100644
--- a/setup.py
+++ b/setup.py
@@ -13,9 +13,9 @@
 INSTALL_REQUIRES = [
-    "transformers == 4.41.1",
+    "transformers == 4.43.2",
     "accelerate == 0.29.2",
-    "optimum ~= 1.20.0",
+    "optimum ~= 1.21.0",
     "huggingface_hub >= 0.20.1",
     "numpy>=1.22.2, <=1.25.2",
     "protobuf<4",
diff --git a/tests/cache/test_neuronx_cache.py b/tests/cache/test_neuronx_cache.py
index a5f3c3886..684e3b6fe 100644
--- a/tests/cache/test_neuronx_cache.py
+++ b/tests/cache/test_neuronx_cache.py
@@ -34,14 +34,14 @@
 )
 from optimum.neuron.utils import get_hub_cached_entries, synchronize_hub_cache
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
-from optimum.utils.testing_utils import TOKEN
 
 
 @pytest.fixture
-def cache_repos():
+def cache_repos(staging):
     # Setup: create temporary Hub repository and local cache directory
-    api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
-    user = api.whoami()["name"]
+    token = staging["token"]
+    user = staging["user"]
+    api = HfApi(endpoint=ENDPOINT_STAGING, token=token)
     hostname = socket.gethostname()
     cache_repo_id = f"{user}/{hostname}-optimum-neuron-cache"
     if api.repo_exists(cache_repo_id):
@@ -57,7 +57,7 @@ def cache_repos():
     os.environ["NEURON_COMPILE_CACHE_URL"] = cache_path
     os.environ["CUSTOM_CACHE_REPO"] = cache_repo_id
     os.environ["HF_ENDPOINT"] = ENDPOINT_STAGING
-    os.environ["HF_TOKEN"] = TOKEN
+    os.environ["HF_TOKEN"] = token
     yield (cache_path, cache_repo_id)
     # Teardown
     api.delete_repo(cache_repo_id)
@@ -173,7 +173,8 @@ def check_traced_cache_entry(cache_path):
 
 
 def assert_local_and_hub_cache_sync(cache_path, cache_repo_id):
-    api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
+    # Since created models are public on the staging endpoint we don't need a token
+    api = HfApi(endpoint=ENDPOINT_STAGING)
     remote_files = api.list_repo_files(cache_repo_id)
     local_files = get_local_cached_files(cache_path)
     for file in local_files:
diff --git a/tests/conftest.py b/tests/conftest.py
index 8062756a5..539136409 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,7 +26,7 @@
     set_neuron_cache_path,
 )
 
-from .utils import OPTIMUM_INTERNAL_TESTING_CACHE_REPO, get_random_string
+from .utils import OPTIMUM_INTERNAL_TESTING_CACHE_REPO, TOKEN_STAGING, USER_STAGING, get_random_string
 
 
 # Inferentia fixtures
@@ -171,3 +171,19 @@ def pytest_fixture_setup(fixturedef, request):
     if getattr(fixturedef.func, "is_dist_fixture", False):
         dist_fixture_class = fixturedef.func()
         dist_fixture_class(request)
+
+
+@pytest.fixture
+def staging():
+    """A pytest fixture only available in huggingface_hub staging mode
+
+    If huggingface_hub is not operating in staging mode, tests using
+    that fixture are automatically skipped.
+
+    Returns:
+        a Dict containing a valid staging user and token.
+    """
+    return {
+        "user": USER_STAGING,
+        "token": TOKEN_STAGING,
+    }
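Note: the new fixture is consumed like any other pytest fixture; it returns a plain dict, so credentials are read with item access. A minimal sketch of a test using it against the staging endpoint (the repository name is made up for illustration, not part of this patch):

```python
from huggingface_hub import HfApi
from transformers.testing_utils import ENDPOINT_STAGING


def test_staging_repo_roundtrip(staging):
    # `staging` is the dict returned by the fixture above: {"user": ..., "token": ...}.
    api = HfApi(endpoint=ENDPOINT_STAGING, token=staging["token"])
    repo_id = f"{staging['user']}/hypothetical-smoke-test-repo"
    api.create_repo(repo_id, repo_type="model")
    try:
        assert api.repo_exists(repo_id)
    finally:
        api.delete_repo(repo_id)
```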
diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py
index c50c3c72c..b8346a7fb 100644
--- a/tests/decoder/conftest.py
+++ b/tests/decoder/conftest.py
@@ -58,8 +58,7 @@ def _export_model(model_id, export_kwargs, neuron_model_path):
     try:
         subprocess.run(export_command, check=True)
     except subprocess.CalledProcessError as e:
-        logger.error(f"Failed to export model: {e}")
-        return
+        raise SystemError(f"Failed to export model: {e}")
 
 
 @pytest.fixture(scope="session", params=DECODER_MODEL_CONFIGURATIONS.keys())
diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py
index 7e372ad9a..d94e0f0e6 100644
--- a/tests/generation/test_hub.py
+++ b/tests/generation/test_hub.py
@@ -13,39 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-import re
 
 from huggingface_hub import HfApi
 from transformers.testing_utils import ENDPOINT_STAGING
 
 from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
-from optimum.utils.testing_utils import TOKEN, USER
-
-
-def _test_push_to_hub(model, model_path, repo_id, ignore_patterns=[]):
-    model.push_to_hub(model_path, repo_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING)
-    api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
-    try:
-        hub_files_path = api.list_repo_files(repo_id)
-        for path, _, files in os.walk(model_path):
-            for name in files:
-                local_file_path = os.path.join(path, name)
-                hub_file_path = os.path.relpath(local_file_path, model_path)
-                excluded = False
-                for pattern in ignore_patterns:
-                    if re.compile(pattern).match(hub_file_path) is not None:
-                        excluded = True
-                        break
-                assert excluded or hub_file_path in hub_files_path
-    finally:
-        api.delete_repo(repo_id)
-
-
-def neuron_push_model_id(model_id):
-    model_name = model_id.split("/")[-1]
-    repo_id = f"{USER}/{model_name}-neuronx"
-    return repo_id
 
 @is_inferentia_test
 @requires_neuronx
@@ -59,6 +32,18 @@ def test_seq2seq_model_from_hub():
 
 @is_inferentia_test
 @requires_neuronx
-def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id):
+def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, staging):
     model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path)
-    _test_push_to_hub(model, neuron_seq2seq_greedy_path, neuron_push_seq2seq_id)
+    model.push_to_hub(
+        neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, use_auth_token=staging["token"], endpoint=ENDPOINT_STAGING
+    )
+    api = HfApi(endpoint=ENDPOINT_STAGING, token=staging["token"])
+    try:
+        hub_files_path = api.list_repo_files(neuron_push_seq2seq_id)
+        for path, _, files in os.walk(neuron_seq2seq_greedy_path):
+            for name in files:
+                local_file_path = os.path.join(path, name)
+                hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_greedy_path)
+                assert hub_file_path in hub_files_path
+    finally:
+        api.delete_repo(neuron_push_seq2seq_id)
diff --git a/tests/test_cache_utils.py b/tests/test_cache_utils.py
index 9e7d45370..0effb7387 100644
--- a/tests/test_cache_utils.py
+++ b/tests/test_cache_utils.py
@@ -37,9 +37,8 @@
     set_neuron_cache_path,
 )
 from optimum.neuron.utils.testing_utils import is_trainium_test
-from optimum.utils.testing_utils import TOKEN, USER
 
-from .utils import StagingTestMixin, TrainiumTestMixin, get_random_string
+from .utils import TOKEN_STAGING, USER_STAGING, StagingTestMixin, TrainiumTestMixin, get_random_string
 
 
 DUMMY_COMPILER_VERSION = "1.2.3"
@@ -147,10 +146,10 @@ def test_list_files_in_neuron_cache(self):
 class StagingNeuronUtilsTestCase(StagingTestMixin, TestCase):
     def test_set_custom_cache_repo_name_in_hf_home(self):
         orig_token = get_token()
-        login(TOKEN)
+        login(TOKEN_STAGING)
 
         repo_name = f"blablabla-{self.seed}"
-        repo_id = f"{USER}/{repo_name}"
+        repo_id = f"{USER_STAGING}/{repo_name}"
         create_repo(repo_name, repo_type="model")
 
         def remove_repo():
diff --git a/tests/utils.py b/tests/utils.py
index 7bd6c279b..060c77596 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -58,11 +58,15 @@
 from optimum.neuron.utils.patching import DynamicPatch, Patcher
 from optimum.neuron.utils.require_utils import requires_neuronx_distributed
 from optimum.utils import logging
-from optimum.utils.testing_utils import TOKEN, USER
 
 
 logger = logging.get_logger(__name__)
 
+
+# Not critical, only usable on the sandboxed CI instance.
+USER_STAGING = "__DUMMY_OPTIMUM_USER__"
+TOKEN_STAGING = "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"
+
 SEED = 42
 
 OPTIMUM_INTERNAL_TESTING_CACHE_REPO = "optimum-internal-testing/optimum-neuron-cache-for-testing"
@@ -450,7 +454,7 @@ def tearDownClass(cls):
 
 class StagingTestMixin:
     CUSTOM_CACHE_REPO_NAME = "optimum-neuron-cache-testing"
-    CUSTOM_CACHE_REPO = f"{USER}/{CUSTOM_CACHE_REPO_NAME}"
+    CUSTOM_CACHE_REPO = f"{USER_STAGING}/{CUSTOM_CACHE_REPO_NAME}"
     CUSTOM_PRIVATE_CACHE_REPO = f"{CUSTOM_CACHE_REPO}-private"
     _token = ""
     MAX_NUM_LINEARS = 20
@@ -468,8 +472,8 @@ def set_hf_hub_token(cls, token: Optional[str]) -> Optional[str]:
 
     @classmethod
     def setUpClass(cls):
-        cls._staging_token = TOKEN
-        cls._token = cls.set_hf_hub_token(TOKEN)
+        cls._staging_token = TOKEN_STAGING
+        cls._token = cls.set_hf_hub_token(TOKEN_STAGING)
         cls._custom_cache_repo_name = load_custom_cache_repo_name_from_hf_home()
         delete_custom_cache_repo_name_from_hf_home()
 
@@ -511,6 +515,6 @@ def remove_all_files_in_repo(self, repo_id: str):
             pass
 
     def tearDown(self):
-        login(TOKEN)
+        login(TOKEN_STAGING)
         self.remove_all_files_in_repo(self.CUSTOM_CACHE_REPO)
         self.remove_all_files_in_repo(self.CUSTOM_PRIVATE_CACHE_REPO)
diff --git a/text-generation-inference/tests/fixtures/model.py b/text-generation-inference/tests/fixtures/model.py
index c94d45784..b1e785308 100644
--- a/text-generation-inference/tests/fixtures/model.py
+++ b/text-generation-inference/tests/fixtures/model.py
@@ -54,8 +54,7 @@ def export_model(model_id, export_kwargs, neuron_model_path):
     try:
         subprocess.run(export_command, check=True)
     except subprocess.CalledProcessError as e:
-        logger.error(f"Failed to export model: {e}")
-        return
+        raise ValueError(f"Failed to export model: {e}")
 
 
 @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys())