diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/latency.png b/docs/assets/benchmarks/inferentia-llama2-13b/latency.png
deleted file mode 100644
index 9f43062dc..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama2-13b/latency.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png
deleted file mode 100644
index bfd5fc07b..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png
deleted file mode 100644
index 2dca870b0..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png
deleted file mode 100644
index 8b26732b6..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png
deleted file mode 100644
index a1fe59446..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png
deleted file mode 100644
index ec7a219db..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png
deleted file mode 100644
index e00192997..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png
deleted file mode 100644
index 8dc1c23dc..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png
deleted file mode 100644
index a76e48020..000000000
Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-llama3.1-8b/latency.png b/docs/assets/benchmarks/inferentia-llama3.1-8b/latency.png
new file mode 100644
index 000000000..bb86b6b2e
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-llama3.1-8b/latency.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama3.1-8b/throughput.png b/docs/assets/benchmarks/inferentia-llama3.1-8b/throughput.png
new file mode 100644
index 000000000..c657a1471
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-llama3.1-8b/throughput.png differ
diff --git a/docs/assets/benchmarks/inferentia-llama3.1-8b/ttft.png b/docs/assets/benchmarks/inferentia-llama3.1-8b/ttft.png
new file mode 100644
index 000000000..afd45542f
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-llama3.1-8b/ttft.png differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-small/latency.png b/docs/assets/benchmarks/inferentia-mistral-small/latency.png
new file mode 100644
index 000000000..00ba008cd
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-mistral-small/latency.png differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-small/throughput.png b/docs/assets/benchmarks/inferentia-mistral-small/throughput.png
new file mode 100644
index 000000000..dcbae4696
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-mistral-small/throughput.png differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-small/ttft.png b/docs/assets/benchmarks/inferentia-mistral-small/ttft.png
new file mode 100644
index 000000000..04dc928c7
Binary files /dev/null and b/docs/assets/benchmarks/inferentia-mistral-small/ttft.png differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/latency.png b/docs/assets/benchmarks/inferentia-mistral-v2/latency.png
deleted file mode 100644
index a02ce879a..000000000
Binary files a/docs/assets/benchmarks/inferentia-mistral-v2/latency.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png b/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png
deleted file mode 100644
index ad940c54d..000000000
Binary files a/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png and /dev/null differ
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png b/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png
deleted file mode 100644
index 6069cc5bc..000000000
Binary files a/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png and /dev/null differ
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 0573fafdb..935556f75 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -46,14 +46,10 @@
     title: NeuronX Text-generation-inference for AWS inferentia2
   title: How-To Guides
 - sections:
-  - local: benchmarks/inferentia-llama2-7b
-    title: Llama2 7b on AWS Inferentia2
-  - local: benchmarks/inferentia-llama2-13b
-    title: Llama2 13b on AWS Inferentia2
-  - local: benchmarks/inferentia-mistral-v2
-    title: Mistral v0.2 7b on AWS Inferentia2
-  - local: benchmarks/inferentia-llama3-8b
-    title: Llama-3 8B on AWS Inferentia2
+  - local: benchmarks/inferentia-mistral-small
+    title: Mistral Small on AWS Inferentia2
+  - local: benchmarks/inferentia-llama3.1-8b
+    title: Llama-3.1 8B on AWS Inferentia2
   title: Benchmarks
 - sections:
   - local: community/contributing
diff --git a/docs/source/benchmarks/inferentia-llama2-13b.mdx b/docs/source/benchmarks/inferentia-llama2-13b.mdx
deleted file mode 100644
index d268bffb8..000000000
--- a/docs/source/benchmarks/inferentia-llama2-13b.mdx
+++ /dev/null
@@ -1,60 +0,0 @@
-
-
-# Llama-2-13b performance on AWS Inferentia2 (Latency & Througput)
-
-How fast is Llama-2-13b on Inferentia2? Let's figure out!
-
-For this benchmark we will use the following configurations:
-
-| Model type      | batch_size | sequence_length |
-|-----------------|------------|-----------------|
-| Llama2 13B BS1  | 1          | 4096            |
-| Llama2 13B BS4  | 4          | 4096            |
-| Llama2 13B BS8  | 8          | 4096            |
-| Llama2 13B BS16 | 16         | 4096            |
-
-*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*
-
-*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*
-
-## Time to first token
-
-The time to first token is the time required to process the input tokens and generate the first output token.
-It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.
-
-We test the time to first token for increasing context sizes, from a typical Q/A usage, to heavy Retrieval Augmented Generation (RAG) use-cases.
-
-Time to first token is expressed in **seconds**.
-
-![Llama2 13b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png "Time to first token")
-
-## Inter-token Latency
-
-The inter-token latency corresponds to the average time elapsed between two generated tokens.
-
-It is expressed in **milliseconds**.
-
-![Llama2 13b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-13b/latency.png "Inter-token latency")
-
-### Throughput
-
-Unlike some other benchmarks, we evaluate the throughput using generated tokens only, by dividing their number
-by the end-to-end latency.
-
-Throughput is expressed in **tokens/second**.
-
-![Llama2 13b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png "Throughput")
diff --git a/docs/source/benchmarks/inferentia-llama3-8b.mdx b/docs/source/benchmarks/inferentia-llama3.1-8b.mdx
similarity index 62%
rename from docs/source/benchmarks/inferentia-llama3-8b.mdx
rename to docs/source/benchmarks/inferentia-llama3.1-8b.mdx
index fd471d8ef..d2d8c17b6 100644
--- a/docs/source/benchmarks/inferentia-llama3-8b.mdx
+++ b/docs/source/benchmarks/inferentia-llama3.1-8b.mdx
@@ -14,19 +14,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-# Llama-3-8b performance on AWS Inferentia2 (Latency & Througput)
+# Llama-3.1-8b performance on AWS Inferentia2 (Latency & Throughput)
 
-How fast is Llama-3-8b on Inferentia2? Let's figure out!
+How fast is Llama-3.1-8b on Inferentia2? Let's figure out!
 
 For this benchmark we will use the following configurations:
 
-| Model type     | batch_size | sequence_length |
-|----------------|------------|-----------------|
-| Llama3 8b BS1  | 1          | 4096            |
-| Llama3 8b BS4  | 4          | 4096            |
-| Llama3 8b BS8  | 8          | 4096            |
-| Llama3 8b BS16 | 16         | 4096            |
-| Llama3 8b BS32 | 32         | 4096            |
+| Model type       | batch_size | sequence_length |
+|------------------|------------|-----------------|
+| Llama3.1 8b BS1  | 1          | 4096            |
+| Llama3.1 8b BS4  | 4          | 4096            |
+| Llama3.1 8b BS8  | 8          | 4096            |
+| Llama3.1 8b BS16 | 16         | 4096            |
+| Llama3.1 8b BS32 | 32         | 4096            |
 
 *Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*
 
@@ -41,7 +41,7 @@ We test the time to first token for increasing context sizes, from a typical Q/A
 
 Time to first token is expressed in **seconds**.
 
-![Llama3 8b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png "Time to first token")
+![Llama3.1 8b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3.1-8b/ttft.png "Time to first token")
 
 ## Inter-token Latency
 
@@ -49,7 +49,7 @@ The inter-token latency corresponds to the average time elapsed between two gene
 
 It is expressed in **milliseconds**.
 
-![Llama3 8b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/latency.png "Inter-token latency")
+![Llama3.1 8b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3.1-8b/latency.png "Inter-token latency")
 
 ### Throughput
 
@@ -58,4 +58,4 @@ by the end-to-end latency.
 
 Throughput is expressed in **tokens/second**.
 
-![Llama3 8b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png "Throughput")
+![Llama3.1 8b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama3.1-8b/throughput.png "Throughput")
diff --git a/docs/source/benchmarks/inferentia-llama2-7b.mdx b/docs/source/benchmarks/inferentia-mistral-small.mdx
similarity index 63%
rename from docs/source/benchmarks/inferentia-llama2-7b.mdx
rename to docs/source/benchmarks/inferentia-mistral-small.mdx
index 2a9de6508..99325a048 100644
--- a/docs/source/benchmarks/inferentia-llama2-7b.mdx
+++ b/docs/source/benchmarks/inferentia-mistral-small.mdx
@@ -14,19 +14,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->
 
-# Llama-2-7b performance on AWS Inferentia2 (Latency & Througput)
+# Mistral-Small-Instruct performance on AWS Inferentia2 (Latency & Throughput)
 
-How fast is Llama-2-7b on Inferentia2? Let's figure out!
+How fast is Mistral on Inferentia2? Let's figure out!
 
 For this benchmark we will use the following configurations:
 
-| Model type     | batch_size | sequence_length |
-|----------------|------------|-----------------|
-| Llama2 7B BS1  | 1          | 4096            |
-| Llama2 7B BS4  | 4          | 4096            |
-| Llama2 7B BS8  | 8          | 4096            |
-| Llama2 7B BS16 | 16         | 4096            |
-| Llama2 7B BS32 | 24         | 4096            |
+| Model type         | batch_size | sequence_length |
+|--------------------|------------|-----------------|
+| Mistral-Small BS1  | 1          | 4096            |
+| Mistral-Small BS4  | 4          | 4096            |
 
 *Note: all models are compiled to use 6 devices corresponding to 12 cores on the `inf2.48xlarge` instance.*
 
@@ -41,7 +38,7 @@ We test the time to first token for increasing context sizes, from a typical Q/A
 
 Time to first token is expressed in **seconds**.
 
-![Llama2 7b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png "Time to first token")
+![Mistral Small inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-small/ttft.png "Time to first token")
 
 ## Inter-token Latency
 
@@ -49,7 +46,7 @@ The inter-token latency corresponds to the average time elapsed between two gene
 
 It is expressed in **milliseconds**.
 
-![Llama2 7b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/latency.png "Inter-token latency")
+![Mistral Small inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-small/latency.png "Inter-token latency")
 
 ### Throughput
 
@@ -58,4 +55,4 @@ by the end-to-end latency.
 
 Throughput is expressed in **tokens/second**.
 
-![Llama2 7b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png "Throughput")
+![Mistral Small inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-small/throughput.png "Throughput")
diff --git a/docs/source/benchmarks/inferentia-mistral-v2.mdx b/docs/source/benchmarks/inferentia-mistral-v2.mdx
deleted file mode 100644
index 3ac3ab738..000000000
--- a/docs/source/benchmarks/inferentia-mistral-v2.mdx
+++ /dev/null
@@ -1,61 +0,0 @@
-
-
-# Mistral-7b-Instruct-v0.2 performance on AWS Inferentia2 (Latency & Througput)
-
-How fast is Mistralv0.2 on Inferentia2? Let's figure out!
-
-For this benchmark we will use the following configurations:
-
-| Model type      | batch_size | sequence_length |
-|-----------------|------------|-----------------|
-| Mistral 7B BS1  | 1          | 4096            |
-| Mistral 7B BS4  | 4          | 4096            |
-| Mistral 7B BS8  | 8          | 4096            |
-| Mistral 7B BS16 | 16         | 4096            |
-| Mistral 7B BS32 | 32         | 4096            |
-
-*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*
-
-*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*
-
-## Time to first token
-
-The time to first token is the time required to process the input tokens and generate the first output token.
-It is a very important metric, as it corresponds to the latency directly perceived by the user when streaming generated tokens.
-
-We test the time to first token for increasing context sizes, from a typical Q/A usage, to heavy Retrieval Augmented Generation (RAG) use-cases.
-
-Time to first token is expressed in **seconds**.
-
-![Mistral 7b inferentia2 TTFT](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png "Time to first token")
-
-## Inter-token Latency
-
-The inter-token latency corresponds to the average time elapsed between two generated tokens.
-
-It is expressed in **milliseconds**.
-
-![Mistral 7b inferentia2 inter-token latency](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-v2/latency.png "Inter-token latency")
-
-### Throughput
-
-Unlike some other benchmarks, we evaluate the throughput using generated tokens only, by dividing their number
-by the end-to-end latency.
-
-Throughput is expressed in **tokens/second**.
-
-![Mistral 7b inferentia2 throughput](https://raw.githubusercontent.com/huggingface/optimum-neuron/main/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png "Throughput")
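
For reference, the three metrics reported on the benchmark pages touched by this diff can be derived from raw generation timestamps roughly as follows. This is a minimal sketch, not the actual benchmarking code behind the plots; the function and variable names are hypothetical.

```python
from typing import Dict, List


def benchmark_metrics(request_start: float, token_times: List[float]) -> Dict[str, float]:
    """Derive TTFT, inter-token latency and throughput from per-token timestamps.

    `request_start` is the submission time of the request and `token_times` holds the
    arrival time of each generated token (both in seconds, e.g. from time.perf_counter()).
    """
    # Time to first token (seconds): prompt processing plus the first generated token.
    ttft = token_times[0] - request_start
    # Inter-token latency (milliseconds): average gap between consecutive generated tokens.
    gaps = [later - earlier for earlier, later in zip(token_times, token_times[1:])]
    inter_token_latency_ms = 1000 * sum(gaps) / len(gaps) if gaps else 0.0
    # Throughput (tokens/second): generated tokens only, divided by end-to-end latency.
    throughput = len(token_times) / (token_times[-1] - request_start)
    return {
        "ttft_s": ttft,
        "inter_token_latency_ms": inter_token_latency_ms,
        "throughput_tokens_per_s": throughput,
    }
```

Under this definition, prompt tokens do not count toward throughput, which matches the pages' note that throughput is computed from generated tokens divided by the end-to-end latency.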