Add wav2vec2 support - export and audio tasks modeling (#645)

* wav2vec2 base support * fix outputs for audio-xvector * add CTC modeling * some tests and modeling * add xvector * fix doc * fix doc * try fix tests * disable auto triggered CIs for inf1
huggingface · Jul 11, 2024 · 56cb8a5 · 56cb8a5
1 parent 17fe854
commit 56cb8a5
Show file tree

Hide file tree

Showing 16 changed files with 827 additions and 40 deletions.
diff --git a/.github/workflows/test_inf1_export.yml b/.github/workflows/test_inf1_export.yml
@@ -1,16 +1,7 @@
 name: Optimum neuron / Test INF1 partial export
 
 on:
-  push:
-    branches: [ main ]
-    paths:
-      - "setup.py"
-      - "optimum/**.py"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "setup.py"
-      - "optimum/**.py"
+  workflow_dispatch
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

diff --git a/.github/workflows/test_inf1_full_export.yml b/.github/workflows/test_inf1_full_export.yml
@@ -1,14 +1,7 @@
 name: Optimum neuron / Test INF1 full export
 
 on:
-  push:
-    branches: [ main ]
-    paths:
-      - "optimum/exporters/neuron/*.py"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "optimum/exporters/neuron/*.py"
+  workflow_dispatch
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

diff --git a/.github/workflows/test_inf1_inference.yml b/.github/workflows/test_inf1_inference.yml
@@ -1,16 +1,7 @@
 name: Optimum neuron / Test INF1 inference
 
 on:
-  push:
-    branches: [ main ]
-    paths:
-      - "setup.py"
-      - "optimum/**.py"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "setup.py"
-      - "optimum/**.py"
+  workflow_dispatch
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

diff --git a/.github/workflows/test_inf1_pipelines.yml b/.github/workflows/test_inf1_pipelines.yml
@@ -1,14 +1,7 @@
 name: Optimum neuron / Test INF1 pipelines
 
 on:
-  push:
-    branches: [ main ]
-    paths:
-      - "optimum/neuron/pipelines/**.py"
-  pull_request:
-    branches: [ main ]
-    paths:
-      - "optimum/neuron/pipelines/**.py"
+  workflow_dispatch
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

diff --git a/docs/source/package_reference/modeling.mdx b/docs/source/package_reference/modeling.mdx
@@ -82,6 +82,22 @@ The following Neuron model classes are available for computer vision tasks.
 ### NeuronModelForObjectDetection
 [[autodoc]] modeling.NeuronModelForObjectDetection
 
+## Audio
+
+The following Neuron model classes are available for audio tasks.
+
+### NeuronModelForAudioClassification
+[[autodoc]] modeling.NeuronModelForAudioClassification
+
+### NeuronModelForAudioFrameClassification
+[[autodoc]] modeling.NeuronModelForAudioFrameClassification
+
+### NeuronModelForCTC
+[[autodoc]] modeling.NeuronModelForCTC
+
+### NeuronModelForXVector
+[[autodoc]] modeling.NeuronModelForXVector
+
 ## Stable Diffusion
 
 The following Neuron model classes are available for stable diffusion tasks.

diff --git a/docs/source/package_reference/supported_models.mdx b/docs/source/package_reference/supported_models.mdx
@@ -53,6 +53,7 @@ limitations under the License.
 | RoFormer                  | feature-extraction, fill-mask, multiple-choice, question-answering, text-classification, token-classification                                 |
 | Swin                      | feature-extraction, image-classification                                                                                                      |
 | T5                        | text2text-generation                                                                                                                          |
+| Wav2Vec2                  | feature-extraction, automatic-speech-recognition, audio-classification, audio-frame-classification, audio-xvector                             |
 | XLM                       | feature-extraction, fill-mask, multiple-choice, question-answering, text-classification, token-classification                                 |
 | ViT                       | feature-extraction, image-classification                                                                                                      |
 | XLM-RoBERTa               | feature-extraction, fill-mask, multiple-choice, question-answering, text-classification, token-classification                                 |

diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py
@@ -249,6 +249,11 @@ def parse_args_neuronx(parser: "ArgumentParser"):
         default=1,
         help=f"Stable diffusion only. Number of images per prompt {doc_input}",
     )
+    input_group.add_argument(
+        "--audio_sequence_length",
+        type=int,
+        help=f"Audio tasks only. Audio sequence length {doc_input}",
+    )
 
     level_group = parser.add_mutually_exclusive_group()
     level_group.add_argument(

diff --git a/optimum/exporters/neuron/config.py b/optimum/exporters/neuron/config.py
@@ -19,6 +19,7 @@
 from typing import List
 
 from ...utils import (
+    DummyAudioInputGenerator,
     DummyBboxInputGenerator,
     DummyInputGenerator,
     DummySeq2SeqDecoderTextInputGenerator,
@@ -59,6 +60,15 @@ class TextAndVisionNeuronConfig(NeuronDefaultConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyVisionInputGenerator, DummyBboxInputGenerator)
 
 
+class AudioNeuronConfig(NeuronDefaultConfig):
+    """
+    Handles audio architectures.
+    """
+
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyAudioInputGenerator, DummyTextInputGenerator)
+    INPUT_ARGS = ("batch_size", "audio_sequence_length")
+
+
 class TextNeuronDecoderConfig(NeuronDecoderConfig):
     """
     Handles text decoder architectures.

diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py
@@ -36,6 +36,7 @@
 )
 from ..tasks import TasksManager
 from .config import (
+    AudioNeuronConfig,
     TextAndVisionNeuronConfig,
     TextEncoderNeuronConfig,
     TextNeuronDecoderConfig,
@@ -402,6 +403,31 @@ def outputs(self) -> List[str]:
         return common_outputs
 
 
+@register_in_tasks_manager(
+    "wav2vec2",
+    *[
+        "feature-extraction",
+        "automatic-speech-recognition",
+        "audio-classification",
+        "audio-frame-classification",
+        "audio-xvector",
+    ],
+)
+class Wav2Vec2NeuronConfig(AudioNeuronConfig):  # export config registered for the "wav2vec2" tasks listed above
+    NORMALIZED_CONFIG_CLASS = NormalizedConfig
+
+    @property
+    def inputs(self) -> List[str]:
+        return ["input_values"]  # the only model input declared for every registered task
+
+    @property
+    def outputs(self) -> List[str]:
+        common_outputs = list(super().outputs)  # copy first: never append to the parent's list in place
+        if self.task == "audio-xvector":
+            common_outputs.append("embeddings")  # the x-vector task adds "embeddings" to the parent's outputs
+        return common_outputs
+
+
 @register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers")
 class UNetNeuronConfig(VisionNeuronConfig):
     ATOL_FOR_VALIDATION = 1e-3

diff --git a/optimum/neuron/__init__.py b/optimum/neuron/__init__.py
@@ -42,6 +42,10 @@
         "NeuronModelForImageClassification",
         "NeuronModelForSemanticSegmentation",
         "NeuronModelForObjectDetection",
+        "NeuronModelForCTC",
+        "NeuronModelForAudioClassification",
+        "NeuronModelForAudioFrameClassification",
+        "NeuronModelForXVector",
     ],
     "modeling_diffusion": [
         "NeuronStableDiffusionPipelineBase",
@@ -71,7 +75,10 @@
     from .accelerate import ModelParallelismPlugin, NeuronAccelerator, NeuronAcceleratorState, NeuronPartialState
     from .hf_argparser import NeuronHfArgumentParser
     from .modeling import (
+        NeuronModelForAudioClassification,
+        NeuronModelForAudioFrameClassification,
         NeuronModelForCausalLM,
+        NeuronModelForCTC,
         NeuronModelForFeatureExtraction,
         NeuronModelForImageClassification,
         NeuronModelForMaskedLM,
@@ -82,6 +89,7 @@
         NeuronModelForSentenceTransformers,
         NeuronModelForSequenceClassification,
         NeuronModelForTokenClassification,
+        NeuronModelForXVector,
     )
     from .modeling_decoder import NeuronDecoderModel
     from .modeling_diffusion import (