diff --git a/Makefile b/Makefile
index 3fe53135d..1ae0aa514 100644
--- a/Makefile
+++ b/Makefile
@@ -40,7 +40,7 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \
 $(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
 	python -m build
 
-TGI_VERSION ?= 2.0.2
+TGI_VERSION ?= 2.1.1
 
 neuronx-tgi: $(PACKAGE_DIST)
 	docker build --rm -f text-generation-inference/Dockerfile \
diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile
index 597a3bea7..09b8f1e80 100644
--- a/text-generation-inference/Dockerfile
+++ b/text-generation-inference/Dockerfile
@@ -8,7 +8,7 @@ RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1
 
 # Build cargo components (adapted from TGI original Dockerfile)
 # Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04)
-FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79-bookworm AS chef
 WORKDIR /usr/src
 
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -20,8 +20,6 @@ COPY --from=tgi /tgi/proto proto
 COPY --from=tgi /tgi/benchmark benchmark
 COPY --from=tgi /tgi/router router
 COPY --from=tgi /tgi/launcher launcher
-# Remove the next line when bumping rust version
-RUN cargo update ravif --precise 0.11.6
 RUN cargo chef prepare --recipe-path recipe.json
 
 FROM chef AS builder
@@ -41,6 +39,8 @@ COPY --from=tgi /tgi/proto proto
 COPY --from=tgi /tgi/benchmark benchmark
 COPY --from=tgi /tgi/router router
 COPY --from=tgi /tgi/launcher launcher
+# Remove this line once TGI has fixed the conflict
+RUN cargo update ureq --precise 2.9.7
 RUN cargo build --release --workspace --exclude benchmark
 
 # Python base image
diff --git a/text-generation-inference/server/text_generation_server/cli.py b/text-generation-inference/server/text_generation_server/cli.py
index 5aa9cceb6..59c09125c 100644
--- a/text-generation-inference/server/text_generation_server/cli.py
+++ b/text-generation-inference/server/text_generation_server/cli.py
@@ -17,6 +17,9 @@ def serve(
     uds_path: str = "/tmp/text-generation-server",
     logger_level: str = "INFO",
     json_output: bool = False,
+    otlp_endpoint: Optional[str] = None,
+    otlp_service_name: str = "text-generation-inference.server",
+    max_input_tokens: Optional[int] = None,
 ):
     """This is the main entry-point for the server CLI.
 
@@ -36,6 +39,12 @@ def serve(
             The server logger level. Defaults to *INFO*.
         json_output (`bool`):
             Use JSON format for log serialization.
+        otlp_endpoint (`Optional[str]`, defaults to `None`):
+            The Open Telemetry endpoint to use.
+        otlp_service_name (`Optional[str]`, defaults to `None`):
+            The name to use when pushing data to the Open Telemetry endpoint.
+        max_input_tokens (`Optional[int]`, defaults to `None`):
+            The maximum number of input tokens each request should contain.
     """
     if sharded:
         raise ValueError("Sharding is not supported.")
diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py
index 9be650c98..54eb7d10a 100644
--- a/text-generation-inference/server/text_generation_server/generator.py
+++ b/text-generation-inference/server/text_generation_server/generator.py
@@ -472,8 +472,18 @@ def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBa
         # just carry on with decoding. We adopt the id of the first
         # batch in the list as our next batch id.
         next_batch_id = batches[0].id
+        request_ids = []
+        for batch in batches:
+            request_ids += batch.request_ids
+        cleared_request_ids = []
+        for slot in self.slots:
+            if slot.state == slot.State.READY and slot.request_id not in request_ids:
+                cleared_request_ids.append(slot.request_id)
+                slot.clear()
+        if len(cleared_request_ids) > 0:
+            logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.")
         active_slots = [slot for slot in self.slots if slot.state == slot.State.READY]
-        if len(active_slots) == 0:
+        if len(active_slots) < len(request_ids):
             raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)")
         if self.model.continuous_batching:
             decode_slots = active_slots
diff --git a/text-generation-inference/tests/integration/test_implicit_env.py b/text-generation-inference/tests/integration/test_implicit_env.py
index ec1708d50..bb090d10c 100644
--- a/text-generation-inference/tests/integration/test_implicit_env.py
+++ b/text-generation-inference/tests/integration/test_implicit_env.py
@@ -17,8 +17,8 @@ async def tgi_service(request, launcher, neuron_model_config):
     # the tgi_env.py script will take care of setting these
     for var in [
         "MAX_BATCH_SIZE",
-        "MAX_INPUT_LENGTH",
-        "MAX_TOTAL_TOKEN",
+        "MAX_INPUT_TOKENS",
+        "MAX_TOTAL_TOKENS",
         "HF_NUM_CORES",
         "HF_AUTO_CAST_TYPE",
     ]:
diff --git a/text-generation-inference/tgi_env.py b/text-generation-inference/tgi_env.py
index 4584358ae..5c3ea9141 100755
--- a/text-generation-inference/tgi_env.py
+++ b/text-generation-inference/tgi_env.py
@@ -16,7 +16,7 @@
 
 logger = logging.getLogger(__name__)
 
-tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_LENGTH"]
+tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS"]
 tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"]
 
 env_config_peering = [
@@ -38,7 +38,9 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
     if not argv:
         argv = sys.argv
     # All these are params passed to tgi and intercepted here
-    parser.add_argument("--max-input-length", type=int, default=os.getenv("MAX_INPUT_LENGTH", 0))
+    parser.add_argument(
+        "--max-input-tokens", type=int, default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))
+    )
     parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0))
     parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0))
     parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID"))
@@ -57,8 +59,8 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace:
     if args.max_total_tokens > 0:
         os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens)
 
-    if args.max_input_length > 0:
-        os.environ["MAX_INPUT_LENGTH"] = str(args.max_input_length)
+    if args.max_input_tokens > 0:
+        os.environ["MAX_INPUT_TOKENS"] = str(args.max_input_tokens)
 
     if args.max_batch_size > 0:
         os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size)
@@ -73,12 +75,12 @@ def neuron_config_to_env(neuron_config):
     with open(os.environ["ENV_FILEPATH"], "w") as f:
         for env_var, config_key in env_config_peering:
             f.write("export {}={}\n".format(env_var, neuron_config[config_key]))
-        max_input_length = os.getenv("MAX_INPUT_LENGTH")
-        if not max_input_length:
-            max_input_length = int(neuron_config["sequence_length"]) // 2
-            if max_input_length == 0:
+        max_input_tokens = os.getenv("MAX_INPUT_TOKENS")
+        if not max_input_tokens:
+            max_input_tokens = int(neuron_config["sequence_length"]) // 2
+            if max_input_tokens == 0:
                 raise Exception("Model sequence length should be greater than 1")
-        f.write("export MAX_INPUT_LENGTH={}\n".format(max_input_length))
+        f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens))
 
 
 def sort_neuron_configs(dictionary):
@@ -149,13 +151,13 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che
             )
             return False
 
-    if os.getenv("MAX_INPUT_LENGTH"):
-        max_input_length = int(os.environ["MAX_INPUT_LENGTH"])
+    max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
+    if max_input_tokens > 0:
         sequence_length = neuron_config["sequence_length"]
-        if max_input_length >= sequence_length:
+        if max_input_tokens >= sequence_length:
             logger.debug(
-                "Specified max input length is not compatible with config sequence length " "( %s >= %s)",
-                max_input_length,
+                "Specified max input tokens is not compatible with config sequence length " "( %s >= %s)",
+                max_input_tokens,
                 sequence_length,
             )
             return False
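
Note: the rename above stays backward compatible, since MAX_INPUT_TOKENS is read first and the legacy MAX_INPUT_LENGTH only serves as a fallback, exactly as in the os.getenv expression used in tgi_env.py. A minimal standalone sketch of that resolution order (illustrative only, not part of the patch; the helper name is made up):

import os

def resolve_max_input_tokens() -> int:
    # Prefer the new variable, fall back to the legacy one, default to 0.
    return int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))

# Only the legacy variable is set: its value is still honored.
os.environ["MAX_INPUT_LENGTH"] = "2048"
assert resolve_max_input_tokens() == 2048

# Both are set: the new variable takes precedence.
os.environ["MAX_INPUT_TOKENS"] = "1024"
assert resolve_max_input_tokens() == 1024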