Switch to libritts_r medium model and use piper-phonemize

rhasspy · Oct 6, 2023 · 48e5492 · 48e5492
1 parent 87f2498
commit 48e5492
Show file tree

Hide file tree

Showing 4 changed files with 1,419 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -17,10 +17,10 @@ python3 -m pip install --upgrade pip
 python3 -m pip install -r requirements.txt
 ```
 
-Download the LibriTTS generator:
+Download the LibriTTS-R generator (exported from [checkpoint](https://huggingface.co/datasets/rhasspy/piper-checkpoints/tree/main/en/en_US/libritts_r/medium)):
 
 ``` sh
-wget -O models/en-us-libritts-high.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v1.0.0/en-us-libritts-high.pt'
+wget -O models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'
 ```
 
 
@@ -42,7 +42,7 @@ python3 generate_samples.py 'okay, piper.' --max-samples 100 --batch-size 10 --o
 
 On an NVidia 2080 Ti with 11GB, a batch size of 100 was possible (generating approximately 100 samples per second).
 
-Setting `--max-speakers` to a value less than 904 (the number if LibriTTS) is recommended. Because very few samples of later speakers were in the original dataset, using them can cause audio artifacts.
+Setting `--max-speakers` to a value less than 904 (the number of speakers in LibriTTS) is recommended. Because very few samples of later speakers were in the original dataset, using them can cause audio artifacts.
 
 See `--help` for more options, including adjusting the `--length-scales` (speaking speeds) and `--slerp-weights` (speaker blending), which are cycled per batch.
 

diff --git a/generate_samples.py b/generate_samples.py
@@ -10,7 +10,7 @@
 import numpy as np
 import torch
 
-from espeak_phonemizer import Phonemizer
+from piper_phonemize import phonemize_espeak, phoneme_ids_espeak
 from piper_train.vits import commons
 
 _DIR = Path(__file__).parent
@@ -21,7 +21,9 @@ def main() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument("text")
     parser.add_argument("--max-samples", required=True, type=int)
-    parser.add_argument("--model", default=_DIR / "models" / "en-us-libritts-high.pt")
+    parser.add_argument(
+        "--model", default=_DIR / "models" / "en_US-libritts_r-medium.pt"
+    )
     parser.add_argument("--batch-size", type=int, default=1)
     parser.add_argument("--slerp-weights", nargs="+", type=float, default=[0.5])
     parser.add_argument(
@@ -61,9 +63,12 @@ def main() -> None:
     if args.max_speakers is not None:
         num_speakers = min(num_speakers, args.max_speakers)
 
-    phonemizer = Phonemizer(voice)
-    phonemes_str = phonemizer.phonemize(args.text)
-    phonemes = list(unicodedata.normalize("NFD", phonemes_str))
+    # Combine all sentences
+    phonemes = [
+        p
+        for sentence_phonemes in phonemize_espeak(args.text, voice)
+        for p in sentence_phonemes
+    ]
     _LOGGER.debug("Phonemes: %s", phonemes)
 
     id_map = config["phoneme_id_map"]