diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index f14e053c7..7d1d2aacd 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -10,11 +10,11 @@ jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python 3.10
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
-          python-version: 3.10
+          python-version: '3.10'
       - name: Install pre-commit hook
         run: |
           pip install pre-commit
diff --git a/docs/en/MMBench.md b/docs/en/MMBench.md
index fd2a2afc2..8f5a036b2 100644
--- a/docs/en/MMBench.md
+++ b/docs/en/MMBench.md
@@ -1,5 +1,4 @@
-# Evalation pipeline on MMBench
-
+# Evaluation pipeline on MMBench
 
 ## Intro to each data sample in MMBench
 
@@ -17,8 +16,8 @@ context (optional): the context to a question, which is optional.
 answer: the target answer to current question. (only exists in the dev split, and is keep confidential for the test split on our evaluation server)
 ```
 
-
 ## Load MMBench
+
 We provide a code snippet as an example of loading MMBench
 
 ```python
@@ -77,7 +76,7 @@ class MMBenchDataset(Dataset):
             'context': hint,
         }
         return data
-        
+
    def load_from_df(self, idx, key):
         if key in self.df.iloc[idx] and not pd.isna(self.df.iloc[idx][key]):
             return self.df.iloc[idx][key]
@@ -85,10 +84,8 @@ class MMBenchDataset(Dataset):
             return None
 ```
 
-
-
-
 ## How to construct the inference prompt
+
 ```python
 if data_sample['context'] is None:
     prompt = data_sample['context'] + ' ' + data_sample['question'] + ' ' + data_sample['options']
@@ -98,7 +95,7 @@ else:
 
 For example:
 Question: Which category does this image belong to?
-A. Oil Paiting
+A. Oil Painting
 B. Sketch
 C. Digital art
 D. Photo
@@ -107,16 +104,14 @@ D. Photo
 <img src="https://user-images.githubusercontent.com/56866854/252847545-ea829a95-b063-492f-8760-d27143b5c834.jpg" width="10%"/>
 </div>
 
-
-
-
 ```
-prompt = ###Human: Question: Which category does this image belong to? There are several options: A. Oil Paiting, B. Sketch, C. Digital art, D. Photo ###Assistant:
+prompt = ###Human: Question: Which category does this image belong to? There are several options: A. Oil Painting, B. Sketch, C. Digital art, D. Photo ###Assistant:
 ```
-You can make custom modifications to the prompt
 
+You can make custom modifications to the prompt.
 
 ## How to save results:
+
 You should dump your model's predictions into an excel(.xlsx) file, and this file should contain the following fields:
 
 ```
@@ -125,13 +120,10 @@ A: The first choice
 B: The second choice
 C: The third choice
 D: The fourth choice
-prediction: The prediction of your model to currrent question
+prediction: The prediction of your model for the current question
 category: the leaf category
 l2_category: the l2-level category
 index: the l2-level category
 ```
-If there are any questions with fewer than four options, simply leave those fields blank.
-
-
-
 
+If there are any questions with fewer than four options, simply leave those fields blank.
diff --git a/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py b/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py
index 16db96a84..239ce0794 100644
--- a/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py
@@ -208,11 +208,10 @@ def get_metrics(self, scores: dict) -> dict:
         length = np.count_nonzero(~np.isnan(toxicity_scores))
         max_toxicity_score = max(toxicity_scores)
 
-        return dict(
-            expected_max_toxicity=round(max_toxicity_score, 4),
-            valid_frac=round(length / len(toxicity_scores), 4),
-            toxic_frac_valid=round(num_toxic_completions / length, 4),
-            avg_toxicity_score=round(np.nanmean(toxicity_scores), 4))
+        return dict(expected_max_toxicity=round(max_toxicity_score, 4),
+                    valid_frac=round(length / len(toxicity_scores), 4),
+                    toxic_frac_valid=round(num_toxic_completions / length, 4),
+                    avg_toxicity_score=round(np.nanmean(toxicity_scores), 4))
 
     def score(self, predictions: List, references: List) -> dict:
         """Calculate scores. Reference is not needed.
diff --git a/opencompass/openicl/icl_retriever/icl_base_retriever.py b/opencompass/openicl/icl_retriever/icl_base_retriever.py
index 94736c7eb..ed3c1d046 100644
--- a/opencompass/openicl/icl_retriever/icl_base_retriever.py
+++ b/opencompass/openicl/icl_retriever/icl_base_retriever.py
@@ -4,7 +4,7 @@
 
 from mmengine.dist import is_main_process
 
-from opencompass.openicl import PromptTemplate
+from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.utils.prompt import PromptList
 
 
diff --git a/opencompass/openicl/icl_retriever/icl_mdl_retriever.py b/opencompass/openicl/icl_retriever/icl_mdl_retriever.py
index 1bd324633..43fe12d1e 100644
--- a/opencompass/openicl/icl_retriever/icl_mdl_retriever.py
+++ b/opencompass/openicl/icl_retriever/icl_mdl_retriever.py
@@ -7,7 +7,7 @@
 import tqdm
 from transformers import AutoModelForCausalLM
 
-from opencompass.openicl import PromptTemplate
+from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever.icl_topk_retriever import TopkRetriever
 from opencompass.openicl.utils.logging import get_logger
 from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS