diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index 7705783e135..fa45cb3cfca 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -104,6 +104,35 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/diffusers
           make --jobs=5 --output-sync=target -C backend/python/diffusers test
 
+  tests-parler-tts:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+          sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+          gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+          sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+          sudo apt-get update && \
+          sudo apt-get install -y conda
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools
+
+          sudo rm -rfv /usr/bin/conda || true
+
+      - name: Test parler-tts
+        run: |
+          export PATH=$PATH:/opt/conda/bin
+          make --jobs=5 --output-sync=target -C backend/python/parler-tts
+          make --jobs=5 --output-sync=target -C backend/python/parler-tts test
+
   tests-transformers-musicgen:
     runs-on: ubuntu-latest
diff --git a/Dockerfile b/Dockerfile
index d0217d50ea2..397fbe22618 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,7 @@ ARG TARGETVARIANT
 ENV BUILD_TYPE=${BUILD_TYPE}
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 
 ARG GO_TAGS="stablediffusion tinydream tts"
@@ -275,6 +275,9 @@ RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
     make -C backend/python/transformers-musicgen \
     ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+    make -C backend/python/parler-tts \
+    ; fi
 RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
     make -C backend/python/coqui \
     ; fi
diff --git a/Makefile b/Makefile
index 1b59c6046cb..d5bc3739956 100644
--- a/Makefile
+++ b/Makefile
@@ -439,10 +439,10 @@ protogen-go-clean:
 	$(RM) bin/*
 
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen sentencetransformers-protogen transformers-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
 
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
 
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -524,6 +524,14 @@ transformers-protogen:
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean
 
+.PHONY: parler-tts-protogen
+parler-tts-protogen:
+	$(MAKE) -C backend/python/parler-tts protogen
+
+.PHONY: parler-tts-protogen-clean
+parler-tts-protogen-clean:
+	$(MAKE) -C backend/python/parler-tts protogen-clean
+
 .PHONY: transformers-musicgen-protogen
 transformers-musicgen-protogen:
 	$(MAKE) -C backend/python/transformers-musicgen protogen
@@ -560,6 +568,7 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/sentencetransformers
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/transformers-musicgen
+	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/petals
diff --git a/backend/python/parler-tts/Makefile b/backend/python/parler-tts/Makefile
new file mode 100644
index 00000000000..4497762ed61
--- /dev/null
+++ b/backend/python/parler-tts/Makefile
@@ -0,0 +1,39 @@
+export CONDA_ENV_PATH = "parler.yml"
+SKIP_CONDA?=0
+ifeq ($(BUILD_TYPE), cublas)
+export CONDA_ENV_PATH = "parler-nvidia.yml"
+endif
+
+# Intel GPUs are supposed to have dependencies installed in the main python
+# environment, so we skip conda installation for SYCL builds.
+# https://github.com/intel/intel-extension-for-pytorch/issues/538
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+export SKIP_CONDA=1
+endif
+
+.PHONY: parler-tts
+parler-tts: protogen
+	@echo "Installing $(CONDA_ENV_PATH)..."
+	bash install.sh $(CONDA_ENV_PATH)
+
+.PHONY: run
+run: protogen
+	@echo "Running parler-tts..."
+	bash run.sh
+	@echo "parler-tts run."
+
+.PHONY: test
+test: protogen
+	@echo "Testing parler-tts..."
+	bash test.sh
+	@echo "parler-tts tested."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
\ No newline at end of file
diff --git a/backend/python/parler-tts/install.sh b/backend/python/parler-tts/install.sh
new file mode 100755
index 00000000000..b9965b230fa
--- /dev/null
+++ b/backend/python/parler-tts/install.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -ex
+
+SKIP_CONDA=${SKIP_CONDA:-0}
+
+# Returns success (0) when the named conda environment does NOT exist yet
+conda_env_exists(){
+    ! conda list --name "${@}" >/dev/null 2>/dev/null
+}
+
+if [ $SKIP_CONDA -eq 1 ]; then
+    echo "Skipping conda environment installation"
+else
+    export PATH=$PATH:/opt/conda/bin
+    if conda_env_exists "parler" ; then
+        echo "Creating virtual environment..."
+        conda env create --name parler --file $1
+        echo "Virtual environment created."
+    else
+        echo "Virtual environment already exists."
+    fi
+fi
+
+if [ $SKIP_CONDA -ne 1 ]; then
+    # Activate conda environment
+    source activate parler
+    # https://github.com/descriptinc/audiotools/issues/101
+    # incompatible protobuf versions.
+    curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o $CONDA_PREFIX/lib/python3.11/site-packages/google/protobuf/internal/builder.py
+fi
+
+if [ "$PIP_CACHE_PURGE" = true ] ; then
+    if [ $SKIP_CONDA -ne 1 ]; then
+        # Activate conda environment
+        source activate parler
+    fi
+
+    pip cache purge
+fi
\ No newline at end of file
diff --git a/backend/python/parler-tts/parler-nvidia.yml b/backend/python/parler-tts/parler-nvidia.yml
new file mode 100644
index 00000000000..ed925e94911
--- /dev/null
+++ b/backend/python/parler-tts/parler-nvidia.yml
@@ -0,0 +1,48 @@
+name: parler
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2023c=h04d1e81_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+    - accelerate>=0.11.0
+    - grpcio==1.59.0
+    - numpy==1.26.0
+    - nvidia-cublas-cu12==12.1.3.1
+    - nvidia-cuda-cupti-cu12==12.1.105
+    - nvidia-cuda-nvrtc-cu12==12.1.105
+    - nvidia-cuda-runtime-cu12==12.1.105
+    - nvidia-cudnn-cu12==8.9.2.26
+    - nvidia-cufft-cu12==11.0.2.54
+    - nvidia-curand-cu12==10.3.2.106
+    - nvidia-cusolver-cu12==11.4.5.107
+    - nvidia-cusparse-cu12==12.1.0.106
+    - nvidia-nccl-cu12==2.18.1
+    - nvidia-nvjitlink-cu12==12.2.140
+    - nvidia-nvtx-cu12==12.1.105
+    - torch==2.1.0
+    - transformers>=4.34.0
+    - descript-audio-codec
+    - sentencepiece
+    - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
+prefix: /opt/conda/envs/parler
diff --git a/backend/python/parler-tts/parler.yml b/backend/python/parler-tts/parler.yml
new file mode 100644
index 00000000000..fd0c3cb6a5c
--- /dev/null
+++ b/backend/python/parler-tts/parler.yml
@@ -0,0 +1,36 @@
+name: parler
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2023.08.22=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_0
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.11=h7f8727e_2
+  - pip=23.2.1=py311h06a4308_0
+  - python=3.11.5=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=68.0.0=py311h06a4308_0
+  - sqlite=3.41.2=h5eee18b_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2023c=h04d1e81_0
+  - wheel=0.41.2=py311h06a4308_0
+  - xz=5.4.2=h5eee18b_0
+  - zlib=1.2.13=h5eee18b_0
+  - pip:
+    - accelerate>=0.11.0
+    - numpy==1.26.0
+    - grpcio==1.59.0
+    - torch==2.1.0
+    - transformers>=4.34.0
+    - descript-audio-codec
+    - sentencepiece
+    - git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
+prefix: /opt/conda/envs/parler
diff --git a/backend/python/parler-tts/parler_tts_server.py b/backend/python/parler-tts/parler_tts_server.py
new file mode 100644
index 00000000000..655990d7572
--- /dev/null
+++ b/backend/python/parler-tts/parler_tts_server.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""
+Extra gRPC server for ParlerTTSForConditionalGeneration models.
+"""
+from concurrent import futures
+
+import argparse
+import signal
+import sys
+import os
+
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import soundfile as sf
+import torch
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS is specified in the environment, use it; otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """
+    A gRPC servicer for the backend service.
+
+    This class implements the gRPC methods for the backend service, including Health, LoadModel, and TTS.
+    """
+    def Health(self, request, context):
+        """
+        A gRPC method that returns the health status of the backend service.
+
+        Args:
+            request: A HealthRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Reply object that contains the health status of the backend service.
+        """
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """
+        A gRPC method that loads a model into memory.
+
+        Args:
+            request: A LoadModelRequest object that contains the request parameters.
+            context: A grpc.ServicerContext object that provides information about the RPC.
+
+        Returns:
+            A Result object that contains the result of the LoadModel operation.
+        """
+        model_name = request.Model
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        try:
+            self.model = ParlerTTSForConditionalGeneration.from_pretrained(model_name).to(device)
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def TTS(self, request, context):
+        model_name = request.model
+        voice = request.voice
+        if voice == "":
+            voice = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."
+        if model_name == "":
+            return backend_pb2.Result(success=False, message="request.model is required")
+        try:
+            device = "cuda:0" if torch.cuda.is_available() else "cpu"
+            input_ids = self.tokenizer(voice, return_tensors="pt").input_ids.to(device)
+            prompt_input_ids = self.tokenizer(request.text, return_tensors="pt").input_ids.to(device)
+
+            generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+            audio_arr = generation.cpu().numpy().squeeze()
+            print("[parler-tts] TTS generated!", file=sys.stderr)
+            sf.write(request.dst, audio_arr, self.model.config.sampling_rate)
+            print("[parler-tts] TTS saved to", request.dst, file=sys.stderr)
+            print("[parler-tts] TTS for", file=sys.stderr)
+            print(request, file=sys.stderr)
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(success=True)
+
+
+def serve(address):
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("[parler-tts] Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("[parler-tts] Received termination signal. Shutting down...", file=sys.stderr)
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+    print(f"[parler-tts] startup: {args}", file=sys.stderr)
+    serve(args.addr)
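Note: not part of the diff, but for reviewers who want to poke at the server above by hand, a minimal client sketch follows. It assumes the backend_pb2 stubs have been generated via `make protogen`, and the model name and output path are illustrative only (the same ModelOptions/TTSRequest fields the server reads are exercised by the test file further below):

    import grpc
    import backend_pb2
    import backend_pb2_grpc

    # Connect to a locally running parler_tts_server.py (default bind address)
    with grpc.insecure_channel("localhost:50051") as channel:
        stub = backend_pb2_grpc.BackendStub(channel)
        # Load the checkpoint (fetched from the Hugging Face hub on first use)
        res = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
        assert res.success, res.message
        # Synthesize speech; the backend writes a WAV file to `dst`
        stub.TTS(backend_pb2.TTSRequest(
            text="Hey, how are you doing today?",
            dst="/tmp/parler-test.wav",  # illustrative output path
        ))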
diff --git a/backend/python/parler-tts/run.sh b/backend/python/parler-tts/run.sh
new file mode 100644
index 00000000000..08e42198521
--- /dev/null
+++ b/backend/python/parler-tts/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+##
+## A bash script wrapper that runs the parler-tts server with conda
+
+echo "Launching gRPC server for parler-tts"
+
+export PATH=$PATH:/opt/conda/bin
+
+# Activate conda environment
+source activate parler
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python $DIR/parler_tts_server.py $@
diff --git a/backend/python/parler-tts/test.sh b/backend/python/parler-tts/test.sh
new file mode 100644
index 00000000000..1bd15fd1508
--- /dev/null
+++ b/backend/python/parler-tts/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+##
+## A bash script wrapper that runs the parler-tts tests with conda
+
+# Activate conda environment
+source activate parler
+
+# get the directory where the bash script is located
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+python -m unittest $DIR/test_parler.py
\ No newline at end of file
diff --git a/backend/python/parler-tts/test_parler.py b/backend/python/parler-tts/test_parler.py
new file mode 100644
index 00000000000..ce9b66acaa9
--- /dev/null
+++ b/backend/python/parler-tts/test_parler.py
@@ -0,0 +1,81 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service
+    """
+    def setUp(self):
+        """
+        This method sets up the gRPC service by starting the server
+        """
+        self.service = subprocess.Popen(["python3", "parler_tts_server.py", "--addr", "localhost:50051"])
+        time.sleep(10)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        self.service.terminate()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+        finally:
+            self.tearDown()
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+        finally:
+            self.tearDown()
+
+    def test_tts(self):
+        """
+        This method tests if TTS audio is generated successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="parler-tts/parler_tts_mini_v0.1"))
+                self.assertTrue(response.success)
+                tts_request = backend_pb2.TTSRequest(text="Hey, how are you doing today?")
+                tts_response = stub.TTS(tts_request)
+                self.assertIsNotNone(tts_response)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service failed")
+        finally:
+            self.tearDown()
\ No newline at end of file
diff --git a/backend/python/transformers-musicgen/run.sh b/backend/python/transformers-musicgen/run.sh
index d3dcb968155..3d3ffcfd63b 100644
--- a/backend/python/transformers-musicgen/run.sh
+++ b/backend/python/transformers-musicgen/run.sh
@@ -8,7 +8,7 @@ echo "Launching gRPC server for transformers-musicgen"
 export PATH=$PATH:/opt/conda/bin
 
 # Activate conda environment
-source activate transformers-musicgen
+source activate transformers
 
 # get the directory where the bash script is located
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
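Note: a sketch, not part of the diff, of how the wiring above can be smoke-tested end to end once an image built with the updated EXTERNAL_GRPC_BACKENDS is running. It assumes a hypothetical model file (e.g. models/parler.yaml) declaring `backend: parler-tts` with `parameters: {model: parler-tts/parler_tts_mini_v0.1}`, and a LocalAI instance listening on :8080:

    # Hypothetical smoke test against a running LocalAI instance.
    import requests  # third-party; pip install requests

    resp = requests.post(
        "http://localhost:8080/tts",  # LocalAI's TTS endpoint
        json={"model": "parler.yaml", "input": "Hey, how are you doing today?"},
    )
    resp.raise_for_status()
    with open("out.wav", "wb") as f:
        f.write(resp.content)  # the endpoint returns the rendered audio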