diff --git a/.gitignore b/.gitignore
index f1181632..50baa8d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,4 @@ tensorboard
 external_tools
 pretrained_models
 s3prl_hub
+whisper_hub
diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
index ad419ccf..1a8fd909 100644
--- a/examples/voxceleb/README.md
+++ b/examples/voxceleb/README.md
@@ -1,6 +1,8 @@
 This is a **WeSpeaker** recipe for the Voxceleb 1&2 dataset. VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube. See https://www.robots.ox.ac.uk/~vgg/data/voxceleb/ for more detailed information. The following recipes are provided:
 
+* v1: **Fully-Supervised** train on the Voxceleb 1 development set and evaluate on the Voxceleb1-O trial.
+
 * v2: **Fully-Supervised** train on Voxceleb 2 development set and evaluate on three official trials.
 
 * v2_deprecated: Deprecated version of fully-supervised train on Voxceleb dataset (deprecated IO).
diff --git a/examples/voxceleb/v1/Whisper-PMFA/README.md b/examples/voxceleb/v1/Whisper-PMFA/README.md
new file mode 100644
index 00000000..88af1615
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/README.md
@@ -0,0 +1,24 @@
+## Results
+
+* Setup: mel80, num_frms500, epoch8, ArcMargin, aug_prob0.6, speed_perturb (no spec_aug)
+
+* Scoring: cosine (sub mean of vox1_dev), AS-Norm
+
+* Metric: EER(%)
+
+* 🔥 UPDATE 2024.08: We support the Whisper-based speaker verification framework Whisper-PMFA. Related paper:
+
+  * [Whisper-PMFA: Partial Multi-Scale Feature Aggregation for Speaker Verification using Whisper Models](https://arxiv.org/pdf/2408.15585)
+
+
+
+| Model                                | AS-Norm | Params | vox1-O-clean |
+| :----------------------------------- | ------- | ------ | :----------: |
+| ECAPA_TDNN_GLOB_c512-ASTP-emb192     | ×       | 6.19M  |     2.23     |
+|                                      | √       | 6.19M  |     2.00     |
+| ResNet34-TSTP-emb256                 | ×       | 6.63M  |     1.99     |
+|                                      | √       | 6.63M  |     1.88     |
+| Whisper-PMFA                         | ×       | 478.7M |     1.62     |
+|                                      | √       | 478.7M |   **1.42**   |
+| Whisper-PMFA with LoRA (coming soon) | √       | 10.9M  |     1.62     |
+
diff --git a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml
new file mode 100644
index 00000000..47b875bb
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml
@@ -0,0 +1,78 @@
+### train configuration
+
+exp_dir: exp/test
+gpus: "[0,1]"
+num_avg: 10
+enable_amp: False # whether to enable automatic mixed precision training
+
+seed: 42
+num_epochs: 4
+save_epoch_interval: 1 # save the model every epoch
+log_batch_interval: 100 # log every 100 batches
+
+dataloader_args:
+  batch_size: 70
+  num_workers: 12
+  pin_memory: False
+  prefetch_factor: 8
+  drop_last: True
+
+dataset_args:
+  shuffle: True
+  shuffle_args:
+    shuffle_size: 2500
+  resample_rate: 16000
+  speed_perturb: True
+  num_frms: 500
+  aug_prob: 0.6 # prob to add reverb & noise aug per sample
+  frontend: whisper_encoder
+  whisper_encoder_args:
+    frozen: True
+    n_mels: 80
+    num_blocks: 24
+    output_size: 1280
+    n_head: 20
+    layer_st: 16
+    layer_ed: 23
+    model_path: whisper_hub/large-v2.pt
+  spec_aug: False
+  spec_aug_args:
+    num_t_mask: 1
+    num_f_mask: 1
+    max_t: 10
+    max_f: 8
+    prob: 0.6
+
+model: whisper_PMFA_large_v2
+model_init: null
+model_args:
+  embed_dim: 192
+projection_args:
+  project_type: "arc_margin" # add_margin, arc_margin, sphere, softmax
+  scale: 32.0
+  easy_margin: False
+
+margin_scheduler: MarginScheduler
+margin_update:
+  initial_margin: 0.2
+  final_margin: 0.2
+  increase_start_epoch: 0
+  fix_start_epoch: 30
+  update_margin: True
+  increase_type: "exp" # exp, linear
+
+loss: CrossEntropyLoss
+loss_args: {}
+
+optimizer: SGD
+optimizer_args:
+  momentum: 0.9
+  nesterov: True
+  weight_decay: 0.0001
+
+scheduler: ExponentialDecrease
+scheduler_args:
+  initial_lr: 0.0025
+  final_lr: 0.00113
+  warm_up_epoch: 0
+  warm_from_zero: False
diff --git a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml
new file mode 100644
index 00000000..aa936979
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml
@@ -0,0 +1,77 @@
+### train configuration
+
+exp_dir: exp/test
+gpus: "[0,1]"
+num_avg: 10
+enable_amp: False # whether to enable automatic mixed precision training
+
+seed: 42
+num_epochs: 8
+save_epoch_interval: 1 # save the model every epoch
+log_batch_interval: 100 # log every 100 batches
+
+dataloader_args:
+  batch_size: 15
+  num_workers: 12
+  pin_memory: False
+  prefetch_factor: 8
+  drop_last: True
+
+dataset_args:
+  shuffle: True
+  shuffle_args:
+    shuffle_size: 2500
+  resample_rate: 16000
+  speed_perturb: True
+  num_frms: 500
+  aug_prob: 0.6 # prob to add reverb & noise aug per sample
+  frontend: whisper_encoder
+  whisper_encoder_args:
+    frozen: False
+    n_mels: 80
+    num_blocks: 24
+    output_size: 1280
+    n_head: 20
+    layer_st: 16
+    layer_ed: 23
+  spec_aug: False
+  spec_aug_args:
+    num_t_mask: 1
+    num_f_mask: 1
+    max_t: 10
+    max_f: 8
+    prob: 0.6
+
+model: whisper_PMFA_large_v2
+model_init: null
+model_args:
+  embed_dim: 192
+projection_args:
+  project_type: "arc_margin" # add_margin, arc_margin, sphere, softmax
+  scale: 32.0
+  easy_margin: False
+
+margin_scheduler: MarginScheduler
+margin_update:
+  initial_margin: 0.2
+  final_margin: 0.2
+  increase_start_epoch: 0
+  fix_start_epoch: 30
+  update_margin: True
+  increase_type: "exp" # exp, linear
+
+loss: CrossEntropyLoss
+loss_args: {}
+
+optimizer: SGD
+optimizer_args:
+  momentum: 0.9
+  nesterov: True
+  weight_decay: 0.0001
+
+scheduler: ExponentialDecrease
+scheduler_args:
+  initial_lr: 0.0025
+  final_lr: 0.00073
+  warm_up_epoch: 0
+  warm_from_zero: False
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/download_data.sh b/examples/voxceleb/v1/Whisper-PMFA/local/download_data.sh
new file mode 100755
index 00000000..61f58914
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/local/download_data.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+download_dir=data/download_data
+
+. tools/parse_options.sh || exit 1
+
+[ ! -d ${download_dir} ] && mkdir -p ${download_dir}
+
+if [ ! -f ${download_dir}/musan.tar.gz ]; then
+  echo "Downloading musan.tar.gz ..."
+  wget --no-check-certificate https://openslr.elda.org/resources/17/musan.tar.gz -P ${download_dir}
+  md5=$(md5sum ${download_dir}/musan.tar.gz | awk '{print $1}')
+  [ "$md5" != "0c472d4fc0c5141eca47ad1ffeb2a7df" ] && echo "Wrong md5sum of musan.tar.gz" && exit 1
+fi
+
+if [ ! -f ${download_dir}/rirs_noises.zip ]; then
+  echo "Downloading rirs_noises.zip ..."
+  wget --no-check-certificate https://us.openslr.org/resources/28/rirs_noises.zip -P ${download_dir}
+  md5=$(md5sum ${download_dir}/rirs_noises.zip | awk '{print $1}')
+  [ "$md5" != "e6f48e257286e05de56413b4779d8ffb" ] && echo "Wrong md5sum of rirs_noises.zip" && exit 1
+fi
+
+if [ ! -f ${download_dir}/vox1_test_wav.zip ]; then
+  echo "Downloading vox1_test_wav.zip ..."
+  wget --no-check-certificate https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip -P ${download_dir}
+  md5=$(md5sum ${download_dir}/vox1_test_wav.zip | awk '{print $1}')
+  [ "$md5" != "185fdc63c3c739954633d50379a3d102" ] && echo "Wrong md5sum of vox1_test_wav.zip" && exit 1
+fi
+
+if [ ! -f ${download_dir}/vox1_dev_wav.zip ]; then
+  echo "Downloading vox1_dev_wav.zip ..."
+  for part in a b c d; do
+    wget --no-check-certificate https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_parta${part} -P ${download_dir} &
+  done
+  wait
+  cat ${download_dir}/vox1_dev* >${download_dir}/vox1_dev_wav.zip
+  md5=$(md5sum ${download_dir}/vox1_dev_wav.zip | awk '{print $1}')
+  [ "$md5" != "ae63e55b951748cc486645f532ba230b" ] && echo "Wrong md5sum of vox1_dev_wav.zip" && exit 1
+fi
+
+
+echo "Download success !!!"
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/download_whisper.sh b/examples/voxceleb/v1/Whisper-PMFA/local/download_whisper.sh
new file mode 100755
index 00000000..d0bf7a6b
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/local/download_whisper.sh
@@ -0,0 +1,13 @@
+download_dir=data/whisper_pretrained_model
+
+. tools/parse_options.sh || exit 1
+
+[ ! -d ${download_dir} ] && mkdir -p ${download_dir}
+
+if [ ! -f ${download_dir}/large-v2.pt ]; then
+  echo "Downloading large-v2.pt ..."
+  wget --no-check-certificate https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt -P ${download_dir}
+  md5=$(md5sum ${download_dir}/large-v2.pt | awk '{print $1}')
+  [ "$md5" != "668764447eeda98eeba5ef7bfcb4cc3d" ] && echo "Wrong md5sum of large-v2.pt" && exit 1
+fi
+
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/extract_vox.sh b/examples/voxceleb/v1/Whisper-PMFA/local/extract_vox.sh
new file mode 100755
index 00000000..613012c1
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/local/extract_vox.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exp_dir=''
+model_path=''
+nj=4
+gpus="[0,1]"
+data_type="shard" # shard/raw/feat
+data=data
+
+. tools/parse_options.sh
+set -e
+
+data_name_array=("vox1_dev" "vox1_test")
+data_list_path_array=("${data}/vox1_dev/${data_type}.list" "${data}/vox1_test/${data_type}.list")
+data_scp_path_array=("${data}/vox1_dev/wav.scp" "${data}/vox1_test/wav.scp") # to count the number of wavs
+nj_array=($nj $nj)
+batch_size_array=(16 1) # batch_size of the test set must be 1 !!!
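+# (test embeddings are extracted on full-length utterances, whose lengths
+# differ, so they cannot be batched together without padding)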
+num_workers_array=(4 1)
+count=${#data_name_array[@]}
+
+for i in $(seq 0 $(($count - 1))); do
+  wavs_num=$(wc -l ${data_scp_path_array[$i]} | awk '{print $1}')
+  bash tools/extract_embedding.sh --exp_dir ${exp_dir} \
+    --model_path $model_path \
+    --data_type ${data_type} \
+    --data_list ${data_list_path_array[$i]} \
+    --wavs_num ${wavs_num} \
+    --store_dir ${data_name_array[$i]} \
+    --batch_size ${batch_size_array[$i]} \
+    --num_workers ${num_workers_array[$i]} \
+    --nj ${nj_array[$i]} \
+    --gpus $gpus &
+done
+
+wait
+
+echo "Embedding dir is (${exp_dir}/embeddings)."
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/prepare_data.sh b/examples/voxceleb/v1/Whisper-PMFA/local/prepare_data.sh
new file mode 100755
index 00000000..6b55499e
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/local/prepare_data.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stage=-1
+stop_stage=-1
+data=data
+
+. tools/parse_options.sh || exit 1
+
+data=`realpath ${data}`
+download_dir=${data}/download_data
+rawdata_dir=${data}/raw_data
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "Download musan.tar.gz, rirs_noises.zip, vox1_test_wav.zip, and vox1_dev_wav.zip."
+  echo "This may take a long time, so we recommend downloading the archives above in your own way first."
+
+  ./local/download_data.sh --download_dir ${download_dir}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "Decompress all archives ..."
+  echo "This could take some time ..."
+
+  for archive in musan.tar.gz rirs_noises.zip vox1_test_wav.zip vox1_dev_wav.zip; do
+    [ ! -f ${download_dir}/$archive ] && echo "Archive $archive does not exist !!!" && exit 1
+  done
+  [ ! -d ${rawdata_dir} ] && mkdir -p ${rawdata_dir}
+
+  if [ ! -d ${rawdata_dir}/musan ]; then
+    tar -xzvf ${download_dir}/musan.tar.gz -C ${rawdata_dir}
+  fi
+
+  if [ ! -d ${rawdata_dir}/RIRS_NOISES ]; then
+    unzip ${download_dir}/rirs_noises.zip -d ${rawdata_dir}
+  fi
+
+  if [ ! -d ${rawdata_dir}/voxceleb1 ]; then
+    mkdir -p ${rawdata_dir}/voxceleb1/test ${rawdata_dir}/voxceleb1/dev
+    unzip ${download_dir}/vox1_test_wav.zip -d ${rawdata_dir}/voxceleb1/test
+    unzip ${download_dir}/vox1_dev_wav.zip -d ${rawdata_dir}/voxceleb1/dev
+  fi
+
+  echo "Decompress success !!!"
+fi
+
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  echo "Prepare wav.scp for each dataset ..."
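+  # For reference, the entries produced below look like (IDs illustrative):
+  #   wav.scp: id10001/1zcIwhmdeo4/00001.wav /abs/path/raw_data/voxceleb1/dev/wav/id10001/1zcIwhmdeo4/00001.wav
+  #   utt2spk: id10001/1zcIwhmdeo4/00001.wav id10001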
+  export LC_ALL=C # kaldi config
+
+  mkdir -p ${data}/musan ${data}/rirs ${data}/vox1_dev ${data}/vox1_test
+  # musan
+  find ${rawdata_dir}/musan -name "*.wav" | awk -F"/" '{print $(NF-2)"/"$(NF-1)"/"$NF,$0}' >${data}/musan/wav.scp
+  # rirs
+  find ${rawdata_dir}/RIRS_NOISES/simulated_rirs -name "*.wav" | awk -F"/" '{print $(NF-2)"/"$(NF-1)"/"$NF,$0}' >${data}/rirs/wav.scp
+  # vox1 dev
+  find ${rawdata_dir}/voxceleb1/dev -name "*.wav" | awk -F"/" '{print $(NF-2)"/"$(NF-1)"/"$NF,$0}' | sort >${data}/vox1_dev/wav.scp
+  awk '{print $1}' ${data}/vox1_dev/wav.scp | awk -F "/" '{print $0,$1}' >${data}/vox1_dev/utt2spk
+  ./tools/utt2spk_to_spk2utt.pl ${data}/vox1_dev/utt2spk >${data}/vox1_dev/spk2utt
+  # vox1 test
+  find ${rawdata_dir}/voxceleb1/test -name "*.wav" | awk -F"/" '{print $(NF-2)"/"$(NF-1)"/"$NF,$0}' | sort >${data}/vox1_test/wav.scp
+  awk '{print $1}' ${data}/vox1_test/wav.scp | awk -F "/" '{print $0,$1}' >${data}/vox1_test/utt2spk
+  ./tools/utt2spk_to_spk2utt.pl ${data}/vox1_test/utt2spk >${data}/vox1_test/spk2utt
+
+  if [ ! -d ${data}/vox1_test/trials ]; then
+    echo "Download trials for vox1_test ..."
+    mkdir -p ${data}/vox1_test/trials
+    #wget --no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test.txt -O ${data}/vox1_test/trials/vox1-O.txt
+    wget --no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt -O ${data}/vox1_test/trials/vox1-O\(cleaned\).txt
+    # transform them into kaldi trial format
+    awk '{if($1==0)label="nontarget";else{label="target"}; print $2,$3,label}' ${data}/vox1_test/trials/vox1-O\(cleaned\).txt >${data}/vox1_test/trials/vox1_O_cleaned.kaldi
+  fi
+
+  echo "Success !!!"
+fi
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/score.sh b/examples/voxceleb/v1/Whisper-PMFA/local/score.sh
new file mode 100755
index 00000000..5b81a883
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/local/score.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exp_dir=
+trials="vox1_O_cleaned.kaldi vox1_E_cleaned.kaldi vox1_H_cleaned.kaldi"
+data=data
+
+stage=-1
+stop_stage=-1
+
+. tools/parse_options.sh
+. path.sh
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "apply cosine scoring ..."
+  mkdir -p ${exp_dir}/scores
+  trials_dir=${data}/vox1_test/trials
+  for x in $trials; do
+    echo $x
+    python wespeaker/bin/score.py \
+      --exp_dir ${exp_dir} \
+      --eval_scp_path ${exp_dir}/embeddings/vox1_test/xvector.scp \
+      --cal_mean True \
+      --cal_mean_dir ${exp_dir}/embeddings/vox1_dev \
+      ${trials_dir}/${x}
+  done
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "compute metrics (EER/minDCF) ..."
+  scores_dir=${exp_dir}/scores
+  for x in $trials; do
+    python wespeaker/bin/compute_metrics.py \
+      --p_target 0.01 \
+      --c_fa 1 \
+      --c_miss 1 \
+      ${scores_dir}/${x}.score \
+      2>&1 | tee -a ${scores_dir}/vox1_cos_result
+
+    echo "compute DET curve ..."
+    python wespeaker/bin/compute_det.py \
+      ${scores_dir}/${x}.score
+  done
+fi
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh b/examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh
new file mode 100755
index 00000000..73431093
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/local/score_norm.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+score_norm_method="asnorm" # asnorm/snorm
+cohort_set=vox2_dev
+top_n=100
+exp_dir=
+trials="vox1_O_cleaned.kaldi vox1_E_cleaned.kaldi vox1_H_cleaned.kaldi"
+data=data
+
+stage=-1
+stop_stage=-1
+
+. tools/parse_options.sh
+. path.sh
+
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  echo "compute mean xvector"
+  python tools/vector_mean.py \
+    --spk2utt ${data}/${cohort_set}/spk2utt \
+    --xvector_scp $exp_dir/embeddings/${cohort_set}/xvector.scp \
+    --spk_xvector_ark $exp_dir/embeddings/${cohort_set}/spk_xvector.ark
+fi
+
+output_name=${cohort_set}_${score_norm_method}
+[ "${score_norm_method}" == "asnorm" ] && output_name=${output_name}${top_n}
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  echo "compute norm score"
+  for x in $trials; do
+    python wespeaker/bin/score_norm.py \
+      --score_norm_method $score_norm_method \
+      --top_n $top_n \
+      --trial_score_file $exp_dir/scores/${x}.score \
+      --score_norm_file $exp_dir/scores/${output_name}_${x}.score \
+      --cohort_emb_scp ${exp_dir}/embeddings/${cohort_set}/spk_xvector.scp \
+      --eval_emb_scp ${exp_dir}/embeddings/vox1_test/xvector.scp \
+      --mean_vec_path ${exp_dir}/embeddings/vox1_dev/mean_vec.npy
+  done
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  echo "compute metrics"
+  for x in ${trials}; do
+    scores_dir=${exp_dir}/scores
+    python wespeaker/bin/compute_metrics.py \
+      --p_target 0.01 \
+      --c_fa 1 \
+      --c_miss 1 \
+      ${scores_dir}/${output_name}_${x}.score \
+      2>&1 | tee -a ${scores_dir}/vox1_${score_norm_method}${top_n}_result
+
+    python wespeaker/bin/compute_det.py \
+      ${scores_dir}/${output_name}_${x}.score
+  done
+fi
diff --git a/examples/voxceleb/v1/Whisper-PMFA/path.sh b/examples/voxceleb/v1/Whisper-PMFA/path.sh
new file mode 100644
index 00000000..e7917ccb
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/path.sh
@@ -0,0 +1,5 @@
+export PATH=$PWD:$PATH
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=../../../../:$PWD:$PYTHONPATH
diff --git a/examples/voxceleb/v1/Whisper-PMFA/run.sh b/examples/voxceleb/v1/Whisper-PMFA/run.sh
new file mode 100644
index 00000000..42b63023
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/run.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+# Copyright 2022 Hongji Wang (jijijiang77@gmail.com)
+#           2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
+#           2022 Zhengyang Chen (chenzhengyang117@gmail.com)
+
+. ./path.sh || exit 1
+
+stage=3
+stop_stage=3
+
+data=data
+data_type="raw" # shard/raw
+model=whisper_PMFA_large_v2
+
+exp_dir=exp/Whisper_PMFA_large_v2_voxceleb1_mel_5s
+
+gpus="[0]"
+num_avg=10
+checkpoint=
+
+trials="vox1_O_cleaned.kaldi"
+
+score_norm_method="asnorm" # asnorm/snorm
+top_n=300
+
+. tools/parse_options.sh || exit 1
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "Preparing datasets ..."
+  ./local/prepare_data.sh --stage 1 --stop_stage 3 --data ${data}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "Convert train and test data to ${data_type} ..."
+  for dset in vox1_dev vox1_test; do
+    if [ $data_type == "shard" ]; then
+      python tools/make_shard_list.py --num_utts_per_shard 1000 \
+        --num_threads 16 \
+        --prefix shards \
+        --shuffle \
+        ${data}/$dset/wav.scp ${data}/$dset/utt2spk \
+        ${data}/$dset/shards ${data}/$dset/shard.list
+    else
+      python tools/make_raw_list.py ${data}/$dset/wav.scp \
+        ${data}/$dset/utt2spk ${data}/$dset/raw.list
+    fi
+  done
+  # Convert all musan data to LMDB
+  python tools/make_lmdb.py ${data}/musan/wav.scp ${data}/musan/lmdb
+  # Convert all rirs data to LMDB
+  python tools/make_lmdb.py ${data}/rirs/wav.scp ${data}/rirs/lmdb
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  echo "Start training with the Whisper parameters frozen ..."
+  config=conf/whisper_PMFA_stage0.yaml
+  num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
+  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+    wespeaker/bin/train.py --config $config \
+    --exp_dir ${exp_dir} \
+    --gpus $gpus \
+    --num_avg ${num_avg} \
+    --data_type "${data_type}" \
+    --train_data ${data}/vox1_dev/${data_type}.list \
+    --train_label ${data}/vox1_dev/utt2spk \
+    --reverb_data ${data}/rirs/lmdb \
+    --noise_data ${data}/musan/lmdb \
+    --model ${model}
+fi
+
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "Start training with all parameters ..."
+
+  if [ -f ${exp_dir}/"config.yaml" ]; then
+    mv ${exp_dir}/"config.yaml" ${exp_dir}/"config_stage0.yaml"
+  fi
+  if [ -f ${exp_dir}/models/"final_model.pt" ]; then
+    mv ${exp_dir}/models/"final_model.pt" ${exp_dir}/models/"final_model_stage0.pt"
+  fi
+
+  config=conf/whisper_PMFA_stage1.yaml
+  num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
+  checkpoint=${exp_dir}/models/model_4.pt
+  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
+    wespeaker/bin/train.py --config $config \
+    --exp_dir ${exp_dir} \
+    --gpus $gpus \
+    --num_avg ${num_avg} \
+    --data_type "${data_type}" \
+    --train_data ${data}/vox1_dev/${data_type}.list \
+    --train_label ${data}/vox1_dev/utt2spk \
+    --reverb_data ${data}/rirs/lmdb \
+    --noise_data ${data}/musan/lmdb \
+    --model ${model} \
+    --checkpoint ${checkpoint}
+fi
+
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  model_path=$exp_dir/models/final_model.pt
+  echo "Extract embeddings ..."
+  local/extract_vox.sh \
+    --exp_dir $exp_dir --model_path $model_path \
+    --nj 2 --gpus $gpus --data_type raw --data ${data}
+fi
+
+
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+  echo "Score ..."
+  local/score.sh \
+    --stage 1 --stop-stage 2 \
+    --exp_dir $exp_dir \
+    --data ${data} \
+    --trials "$trials"
+fi
+
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+  echo "Score norm ..."
+  local/score_norm.sh \
+    --stage 1 --stop-stage 3 \
+    --score_norm_method $score_norm_method \
+    --cohort_set vox1_dev \
+    --top_n $top_n \
+    --exp_dir $exp_dir \
+    --data ${data} \
+    --trials "$trials"
+fi
diff --git a/examples/voxceleb/v1/Whisper-PMFA/tools b/examples/voxceleb/v1/Whisper-PMFA/tools
new file mode 120000
index 00000000..8a51cc50
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/tools
@@ -0,0 +1 @@
+../../../../tools
diff --git a/examples/voxceleb/v1/Whisper-PMFA/wespeaker b/examples/voxceleb/v1/Whisper-PMFA/wespeaker
new file mode 120000
index 00000000..b7f7ab8b
--- /dev/null
+++ b/examples/voxceleb/v1/Whisper-PMFA/wespeaker
@@ -0,0 +1 @@
+../../../../wespeaker
diff --git a/wespeaker/bin/extract.py b/wespeaker/bin/extract.py
index dfa5fdac..032d98c3 100644
--- a/wespeaker/bin/extract.py
+++ b/wespeaker/bin/extract.py
@@ -47,7 +47,7 @@ def extract(config='conf/config.yaml', **kwargs):
     # model: frontend (optional) => speaker model
     model = get_speaker_model(configs['model'])(**configs['model_args'])
     frontend_type = test_conf.get('frontend', 'fbank')
-    if frontend_type == 's3prl':
+    if frontend_type != 'fbank':
         frontend_args = frontend_type + "_args"
         print('Initializing frontend model (this could take some time) ...')
         frontend = frontend_class_dict[frontend_type](
diff --git a/wespeaker/bin/train.py b/wespeaker/bin/train.py
index 3c534353..d4e440be 100644
--- a/wespeaker/bin/train.py
+++ b/wespeaker/bin/train.py
@@ -109,18 +109,17 @@ def train(config='conf/config.yaml', **kwargs):
     logger.info("<== Model ==>")
     # frontend: fbank or s3prl
     frontend_type = configs['dataset_args'].get('frontend', 'fbank')
-    if frontend_type == 's3prl':
+    if frontend_type != "fbank":
         frontend_args = frontend_type + "_args"
         frontend = frontend_class_dict[frontend_type](
             **configs['dataset_args'][frontend_args],
             sample_rate=configs['dataset_args']['resample_rate'])
-        # speaker model
         configs['model_args']['feat_dim'] = frontend.output_size()
         model = get_speaker_model(configs['model'])(**configs['model_args'])
         model.add_module("frontend", frontend)
-    else:  # == 'fbank'
-        # speaker model
+    else:
         model = get_speaker_model(configs['model'])(**configs['model_args'])
+
     if rank == 0:
         num_params = sum(param.numel() for param in model.parameters())
         logger.info('speaker_model size: {}'.format(num_params))
diff --git a/wespeaker/frontend/__init__.py b/wespeaker/frontend/__init__.py
index 9b9fd27b..5278e52c 100644
--- a/wespeaker/frontend/__init__.py
+++ b/wespeaker/frontend/__init__.py
@@ -14,5 +14,9 @@
 
 from .s3prl import S3prlFrontend
+from .whisper_encoder import whisper_encoder
 
-frontend_class_dict = {'s3prl' : S3prlFrontend}
+frontend_class_dict = {
+    'fbank': None,
+    's3prl': S3prlFrontend,
+    'whisper_encoder': whisper_encoder}
diff --git a/wespeaker/frontend/whisper_encoder.py b/wespeaker/frontend/whisper_encoder.py
new file mode 100644
index 00000000..949bafd5
--- /dev/null
+++ b/wespeaker/frontend/whisper_encoder.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2024 Yiyang Zhao (zhaoyy22@mails.tsinghua.edu.cn)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.distributed as dist
+from torch import Tensor
+from torch import nn
+
+from typing import Iterable, Optional
+
+import os
+import hashlib
+import whisper
+import logging
+import urllib.request
+
+
+class Linear(nn.Linear):
+    def forward(self, x: Tensor) -> Tensor:
+        return F.linear(
+            x, self.weight.to(x.dtype),
+            None if self.bias is None else self.bias.to(x.dtype))
+
+
+class Conv1d(nn.Conv1d):
+    def _conv_forward(self, x: Tensor, weight: Tensor,
+                      bias: Optional[Tensor]) -> Tensor:
+        return super()._conv_forward(
+            x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
+        )
+
+
+class LayerNorm(nn.LayerNorm):
+    def forward(self, x: Tensor) -> Tensor:
+        return super().forward(x.float()).type(x.dtype)
+
+
+def sinusoids(length, channels, max_timescale=10000):
+    """Returns sinusoids for positional embedding"""
+    assert channels % 2 == 0
+    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+    inv_timescales = torch.exp(-log_timescale_increment *
+                               torch.arange(channels // 2))
+    scaled_time = torch.arange(
+        length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, n_state: int, n_head: int):
+        super().__init__()
+        self.n_head = n_head
+        self.query = Linear(n_state, n_state)
+        self.key = Linear(n_state, n_state, bias=False)
+        self.value = Linear(n_state, n_state)
+        self.out = Linear(n_state, n_state)
+
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        q = self.query(x)
+
+        if kv_cache is None or xa is None or self.key not in kv_cache:
+            # hooks, if installed (i.e. kv_cache is not None),
+            # will prepend the cached kv tensors; otherwise,
+            # perform key/value projections for self- or
+            # cross-attention as usual.
+            k = self.key(x if xa is None else xa)
+            v = self.value(x if xa is None else xa)
+        else:
+            # for cross-attention, calculate keys and values once
+            # and reuse in subsequent calls.
+            k = kv_cache[self.key]
+            v = kv_cache[self.value]
+
+        wv, qk = self.qkv_attention(q, k, v, mask)
+        return self.out(wv), qk
+
+    def qkv_attention(
+            self,
+            q: Tensor,
+            k: Tensor,
+            v: Tensor,
+            mask: Optional[Tensor] = None):
+        n_batch, n_ctx, n_state = q.shape
+        scale = (n_state // self.n_head) ** -0.25
+        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
+        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
+        v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+
+        qk = q @ k
+        if mask is not None:
+            qk = qk + mask[:n_ctx, :n_ctx]
+        qk = qk.float()
+
+        w = F.softmax(qk, dim=-1).to(q.dtype)
+        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
+
+
+class ResidualAttentionBlock(nn.Module):
+    def __init__(self, n_state: int, n_head: int,
+                 cross_attention: bool = False):
+        super().__init__()
+
+        self.attn = MultiHeadAttention(n_state, n_head)
+        self.attn_ln = LayerNorm(n_state)
+
+        self.cross_attn = MultiHeadAttention(
+            n_state, n_head) if cross_attention else None
+        self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
+
+        n_mlp = n_state * 4
+        self.mlp = nn.Sequential(
+            Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state))
+        self.mlp_ln = LayerNorm(n_state)
+
+    def forward(
+        self,
+        x: Tensor,
+        xa: Optional[Tensor] = None,
+        mask: Optional[Tensor] = None,
+        kv_cache: Optional[dict] = None,
+    ):
+        x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache)[0]
+        if self.cross_attn:
+            x = x + self.cross_attn(self.cross_attn_ln(x),
+                                    xa, kv_cache=kv_cache)[0]
+        x = x + self.mlp(self.mlp_ln(x))
+        return x
+
+
+class AudioEncoder(nn.Module):
+    def __init__(
+            self,
+            n_mels: int,
+            n_ctx: int,
+            n_state: int,
+            n_head: int,
+            n_layer: int,
+            layer_st: int,
+            layer_ed: int):
+        super().__init__()
+        self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+        self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2,
+                            padding=1)
+        self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state))
+
+        self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+            [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+        )
+        # self.ln_post = LayerNorm(n_state)
+        # ADD: a new layer norm over the concatenated block outputs
+        self.ln_post2 = LayerNorm(n_state * (layer_ed - layer_st + 1))
+
+        self.layer_st = layer_st
+        self.layer_ed = layer_ed
+
+    def forward(self, x: Tensor):
+        """
+        x : torch.Tensor, shape = (batch_size, n_ctx, n_mels)
+            the mel spectrogram of the audio
+        """
+        # ADD: (B, T, n_mels) -> (B, n_mels, T) for the convolutions
+        x = x.permute(0, 2, 1)
+
+        x = x.squeeze(1)
+        x = F.gelu(self.conv1(x))
+        x = F.gelu(self.conv2(x))
+        x = x.permute(0, 2, 1)
+
+        # Change: tailor the positional embedding to the actual length
+        assert x.shape[2:] == self.positional_embedding.shape[1:], \
+            "incorrect audio shape"
+        if self.positional_embedding.shape[0] > x.shape[1]:
+            temp_positional_embedding = self.positional_embedding[:x.shape[1], :]
+        elif self.positional_embedding.shape[0] < x.shape[1]:
+            x = x[:, :self.positional_embedding.shape[0], :]
+            temp_positional_embedding = self.positional_embedding
+        else:
+            temp_positional_embedding = self.positional_embedding
+
+        x = (x + temp_positional_embedding).to(x.dtype)
+
+        # Change: collect and concatenate the outputs of blocks
+        # layer_st..layer_ed
+        out = []
+        for i, block in enumerate(self.blocks):
+            x = block(x)
+            if self.layer_st <= i <= self.layer_ed:
+                out.append(x)
+
+        xs = torch.cat(out, dim=-1)
+
+        xs = self.ln_post2(xs)
+        return xs
+
+
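+# PMFA in brief: rather than using only the final encoder output, the hidden
+# states of blocks layer_st..layer_ed are kept and concatenated along the
+# feature axis, so large-v2 with layer_st=16 and layer_ed=23 produces
+# (23 - 16 + 1) * 1280 = 10240-dim frame-level features before pooling.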
+class whisper_encoder(torch.nn.Module):
+    def __init__(self,
+                 frozen=False,
+                 n_mels=80,
+                 num_blocks=24,
+                 output_size=1280,
+                 n_head=20,
+                 layer_st=16,
+                 layer_ed=23,
+                 model_path=None,
+                 sample_rate=16000):
+        super(whisper_encoder, self).__init__()
+        self.encoder = AudioEncoder(
+            n_mels=n_mels,
+            n_layer=num_blocks,
+            n_state=output_size,
+            n_ctx=1500,
+            n_head=n_head,
+            layer_st=layer_st,
+            layer_ed=layer_ed)
+        # frozen=True: keep the Whisper encoder fixed (stage 0);
+        # frozen=False: finetune all encoder parameters (stage 1)
+        self.frozen = frozen
+        self.single_output_size = output_size
+        self.concat_layer = layer_ed - layer_st + 1
+        self.n_mels = n_mels
+
+        # load model
+        if model_path:
+            if dist.is_initialized():
+                if dist.get_rank() == 0:
+                    self._download_whisper_model(model_path)
+                dist.barrier()  # Wait for rank 0 to finish downloading
+                self._load_pretrained_weights(model_path)
+            else:
+                self._download_whisper_model(model_path)
+                self._load_pretrained_weights(model_path)
+
+        if self.frozen:
+            for param in self.encoder.parameters():
+                param.requires_grad_(False)
+
+    def _download_whisper_model(self, model_path='whisper_hub/large-v2.pt'):
+        download_dir = os.path.dirname(model_path)
+        if not os.path.exists(download_dir):
+            os.makedirs(download_dir)
+        if not os.path.isfile(model_path):
+            print("Downloading large-v2.pt ...")
+            url = 'https://openaipublic.azureedge.net/main/whisper/models/' \
+                  '81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/' \
+                  'large-v2.pt'
+
+            urllib.request.urlretrieve(url, model_path)
+
+            with open(model_path, 'rb') as f:
+                md5 = hashlib.md5(f.read()).hexdigest()
+
+            if md5 != "668764447eeda98eeba5ef7bfcb4cc3d":
+                print("Wrong md5sum of large-v2.pt")
+                os.remove(model_path)
+                raise ValueError("MD5 checksum does not match!")
+        else:
+            print("Model already downloaded.")
+
+    def _load_pretrained_weights(self, model_path):
+        print(f"Loading pretrained weights from {model_path}...")
+
+        state_dict = torch.load(model_path, map_location=torch.device('cpu'))
+        state_dict = state_dict['model_state_dict']
+
+        # the checkpoint stores keys as "encoder.*"; strip the prefix so they
+        # match this module's AudioEncoder
+        new_state_dict = {}
+        for k, v in state_dict.items():
+            new_key = k.replace('encoder.', '', 1)
+            new_state_dict[new_key] = v
+
+        missing_keys, unexpected_keys = self.encoder.load_state_dict(
+            new_state_dict, strict=False)
+        print("Pretrained weights loaded successfully.")
+        for key in missing_keys:
+            logging.warning('missing tensor: {}'.format(key))
+        for key in unexpected_keys:
+            logging.warning('unexpected tensor: {}'.format(key))
+
+    def output_size(self):
+        return self.single_output_size * self.concat_layer
+
+    def forward(self, wavs, wavs_len):
+        # mel extraction carries no gradient; the encoder itself must stay
+        # outside no_grad so it can be finetuned when frozen=False
+        with torch.no_grad():
+            processed_feats = []
+            for i in range(wavs.size(0)):
+                tf_tensor = wavs[i].unsqueeze(0).to(wavs.device)
+                mat = whisper.log_mel_spectrogram(
+                    tf_tensor.squeeze(), n_mels=self.n_mels)
+                processed_feats.append(mat)
+
+            feat = torch.stack(processed_feats, dim=0).to(wavs.device)
+
+        feat = feat.transpose(1, 2)
+        # (B, T, F)
+        x = self.encoder(feat)
+        return x, None
diff --git a/wespeaker/models/speaker_model.py b/wespeaker/models/speaker_model.py
index 4ef4a311..d0c10949 100644
--- a/wespeaker/models/speaker_model.py
+++ b/wespeaker/models/speaker_model.py
@@ -20,9 +20,11 @@
 import wespeaker.models.eres2net as eres2net
 import wespeaker.models.gemini_dfresnet as gemini
 import wespeaker.models.res2net as res2net
+import wespeaker.models.whisper_PMFA as whisper_PMFA
 import wespeaker.models.redimnet as redimnet
 
+
 def get_speaker_model(model_name: str):
     if model_name.startswith("XVEC"):
         return getattr(tdnn, model_name)
@@ -40,6 +42,8 @@ def get_speaker_model(model_name: str):
         return getattr(res2net, model_name)
     elif model_name.startswith("Gemini"):
         return getattr(gemini, model_name)
+    elif model_name.startswith("whisper_PMFA"):
+        return getattr(whisper_PMFA, model_name)
     elif model_name.startswith("ReDimNet"):
         return getattr(redimnet, model_name)
     else:  # model_name error !!!
diff --git a/wespeaker/models/whisper_PMFA.py b/wespeaker/models/whisper_PMFA.py
new file mode 100644
index 00000000..c11483d8
--- /dev/null
+++ b/wespeaker/models/whisper_PMFA.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2024 Yiyang Zhao (zhaoyy22@mails.tsinghua.edu.cn)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torch import nn
+
+import wespeaker.models.pooling_layers as pooling_layers
+
+
+class BatchNorm1d(nn.Module):
+    """Applies 1d batch normalization to the input tensor.
+
+    Arguments
+    ---------
+    input_shape : tuple
+        The expected shape of the input. Alternatively, use ``input_size``.
+    input_size : int
+        The expected size of the input. Alternatively, use ``input_shape``.
+    eps : float
+        This value is added to the std deviation estimation to improve the
+        numerical stability.
+    momentum : float
+        It is a value used for the running_mean and running_var computation.
+    affine : bool
+        When set to True, the affine parameters are learned.
+    track_running_stats : bool
+        When set to True, this module tracks the running mean and variance,
+        and when set to False, this module does not track such statistics.
+    combine_batch_time : bool
+        When true, it combines the batch and time axes.
+
+
+    Example
+    -------
+    >>> input = torch.randn(100, 10)
+    >>> norm = BatchNorm1d(input_shape=input.shape)
+    >>> output = norm(input)
+    >>> output.shape
+    torch.Size([100, 10])
+    """
+
+    def __init__(
+        self,
+        input_shape=None,
+        input_size=None,
+        eps=1e-05,
+        momentum=0.1,
+        affine=True,
+        track_running_stats=True,
+        combine_batch_time=False,
+        skip_transpose=True,
+    ):
+        super().__init__()
+        self.combine_batch_time = combine_batch_time
+        self.skip_transpose = skip_transpose
+
+        if input_size is None and skip_transpose:
+            input_size = input_shape[1]
+        elif input_size is None:
+            input_size = input_shape[-1]
+
+        self.norm = nn.BatchNorm1d(
+            input_size,
+            eps=eps,
+            momentum=momentum,
+            affine=affine,
+            track_running_stats=track_running_stats,
+        )
+
+    def forward(self, x):
+        """Returns the normalized input tensor.
+
+        Arguments
+        ---------
+        x : torch.Tensor (batch, time, [channels])
+            input to normalize. 2d or 3d tensors are expected in input;
+            4d tensors can be used when combine_batch_time=True.
+ """ + shape_or = x.shape + if self.combine_batch_time: + if x.ndim == 3: + x = x.reshape(shape_or[0] * shape_or[1], shape_or[2]) + else: + x = x.reshape( + shape_or[0] * shape_or[1], shape_or[3], shape_or[2] + ) + + elif not self.skip_transpose: + x = x.transpose(-1, 1) + + x_n = self.norm(x) + + if self.combine_batch_time: + x_n = x_n.reshape(shape_or) + elif not self.skip_transpose: + x_n = x_n.transpose(1, -1) + + return x_n + + +class whisper_PMFA(torch.nn.Module): + def __init__( + self, + output_size=1280, + embedding_dim=192, + pooling_func='ASTP', + global_context_att=True): + super(whisper_PMFA, self).__init__() + self.pooling = getattr(pooling_layers, pooling_func)( + in_dim=output_size, global_context_att=global_context_att) + self.bn = BatchNorm1d(input_size=output_size * 2) + self.fc = torch.nn.Linear(output_size * 2, embedding_dim) + + def forward(self, x): + x = x.permute(0, 2, 1) + x = self.pooling(x) + x = x.unsqueeze(-1) + x = self.bn(x) + x = x.permute(0, 2, 1) + x = self.fc(x) + x = x.squeeze(1) + return x + + +def whisper_PMFA_large_v2(feat_dim, embed_dim): + return whisper_PMFA(output_size=feat_dim, + embedding_dim=embed_dim + )