Commit c85d559

feat(chatterbox): support multilingual (#6240)
* feat(chatterbox): support multilingual
* Add l4t support
* Fixups
* fix: switch to fork (until resemble-ai/chatterbox#295 is merged)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
1 parent b5efc4f commit c85d559
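
The backend.py changes diffed below load ChatterboxMultilingualTTS when a multilingual option is set and forward a language option to generate() as language_id. As a minimal sketch of that generation path, assuming the pinned chatterbox fork exposes the same API used in the diff (the language code, text, and output file are illustrative placeholders only):

```python
# Minimal sketch of the multilingual generation path this commit wires up.
# Assumes the chatterbox fork pinned in the requirements below exposes
# ChatterboxMultilingualTTS as used in backend.py; the language code,
# text, and output path are illustrative placeholders.
import torch
import torchaudio as ta
from chatterbox.mtl_tts import ChatterboxMultilingualTTS

device = "cuda" if torch.cuda.is_available() else "cpu"
model = ChatterboxMultilingualTTS.from_pretrained(device=device)

# language_id mirrors the backend's "language" option; an optional
# audio_prompt_path would point at a reference clip for voice cloning.
wav = model.generate("Bonjour tout le monde", language_id="fr")
ta.save("out.wav", wav, model.sr)  # sample rate exposed as model.sr
```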

11 files changed (+107 additions, -21 deletions)

.github/workflows/backend.yml

Lines changed: 12 additions & 0 deletions
@@ -955,6 +955,18 @@ jobs:
           backend: "exllama2"
           dockerfile: "./backend/Dockerfile.python"
           context: "./backend"
+        - build-type: 'cublas'
+          cuda-major-version: "12"
+          cuda-minor-version: "0"
+          platforms: 'linux/arm64'
+          skip-drivers: 'true'
+          tag-latest: 'auto'
+          tag-suffix: '-nvidia-l4t-arm64-chatterbox'
+          base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+          runs-on: 'ubuntu-24.04-arm'
+          backend: "chatterbox"
+          dockerfile: "./backend/Dockerfile.python"
+          context: "./backend"
         # runs out of space on the runner
         # - build-type: 'hipblas'
         #   cuda-major-version: ""

Makefile

Lines changed: 3 additions & 0 deletions
@@ -429,6 +429,9 @@ docker-build-kitten-tts:
 docker-save-kitten-tts: backend-images
 	docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar
 
+docker-save-chatterbox: backend-images
+	docker save local-ai-backend:chatterbox -o backend-images/chatterbox.tar
+
 docker-build-kokoro:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend
 

backend/index.yaml

Lines changed: 12 additions & 0 deletions
@@ -353,6 +353,7 @@
       nvidia: "cuda12-chatterbox"
       metal: "metal-chatterbox"
       default: "cpu-chatterbox"
+      nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - &piper
   name: "piper"
   uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1239,6 +1240,7 @@
       nvidia: "cuda12-chatterbox-development"
       metal: "metal-chatterbox-development"
       default: "cpu-chatterbox-development"
+      nvidia-l4t: "nvidia-l4t-arm64-chatterbox"
 - !!merge <<: *chatterbox
   name: "cpu-chatterbox"
   uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
@@ -1249,6 +1251,16 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
   mirrors:
   - localai/localai-backends:master-cpu-chatterbox
+- !!merge <<: *chatterbox
+  name: "nvidia-l4t-arm64-chatterbox"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox"
+  mirrors:
+  - localai/localai-backends:latest-gpu-nvidia-l4t-arm64-chatterbox
+- !!merge <<: *chatterbox
+  name: "nvidia-l4t-arm64-chatterbox-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-l4t-arm64-chatterbox"
+  mirrors:
+  - localai/localai-backends:master-gpu-nvidia-l4t-arm64-chatterbox
 - !!merge <<: *chatterbox
   name: "metal-chatterbox"
   uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"

backend/python/chatterbox/backend.py

Lines changed: 54 additions & 8 deletions
@@ -14,9 +14,23 @@
 import torch
 import torchaudio as ta
 from chatterbox.tts import ChatterboxTTS
-
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
 import grpc
 
+def is_float(s):
+    """Check if a string can be converted to float."""
+    try:
+        float(s)
+        return True
+    except ValueError:
+        return False
+def is_int(s):
+    """Check if a string can be converted to int."""
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -47,6 +61,28 @@ def LoadModel(self, request, context):
         if not torch.cuda.is_available() and request.CUDA:
             return backend_pb2.Result(success=False, message="CUDA is not available")
 
+
+        options = request.Options
+
+        # empty dict
+        self.options = {}
+
+        # The options are a list of strings in this form optname:optvalue
+        # We are storing all the options in a dict so we can use it later when
+        # generating the images
+        for opt in options:
+            if ":" not in opt:
+                continue
+            key, value = opt.split(":")
+            # if value is a number, convert it to the appropriate type
+            if is_float(value):
+                value = float(value)
+            elif is_int(value):
+                value = int(value)
+            elif value.lower() in ["true", "false"]:
+                value = value.lower() == "true"
+            self.options[key] = value
+
         self.AudioPath = None
 
         if os.path.isabs(request.AudioPath):
@@ -56,10 +92,14 @@ def LoadModel(self, request, context):
             modelFileBase = os.path.dirname(request.ModelFile)
             # modify LoraAdapter to be relative to modelFileBase
             self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
-
         try:
             print("Preparing models, please wait", file=sys.stderr)
-            self.model = ChatterboxTTS.from_pretrained(device=device)
+            if "multilingual" in self.options:
+                # remove key from options
+                del self.options["multilingual"]
+                self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+            else:
+                self.model = ChatterboxTTS.from_pretrained(device=device)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
@@ -68,12 +108,18 @@ def LoadModel(self, request, context):
 
     def TTS(self, request, context):
         try:
-            # Generate audio using ChatterboxTTS
+            kwargs = {}
+
+            if "language" in self.options:
+                kwargs["language_id"] = self.options["language"]
             if self.AudioPath is not None:
-                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
-            else:
-                wav = self.model.generate(request.text)
-
+                kwargs["audio_prompt_path"] = self.AudioPath
+
+            # add options to kwargs
+            kwargs.update(self.options)
+
+            # Generate audio using ChatterboxTTS
+            wav = self.model.generate(request.text, **kwargs)
             # Save the generated audio
             ta.save(request.dst, wav, self.model.sr)
 

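For reference, here is a small, self-contained sketch of how the "optname:optvalue" handling added in LoadModel above behaves, reusing the same is_float/is_int conversion logic from backend.py; the option strings ("multilingual:true", "language:fr", "exaggeration:0.7") are hypothetical examples, not values taken from this commit.

```python
# Sketch of the option parsing added in LoadModel; the inputs are hypothetical.
def is_float(s):
    """Check if a string can be converted to float."""
    try:
        float(s)
        return True
    except ValueError:
        return False

def is_int(s):
    """Check if a string can be converted to int."""
    try:
        int(s)
        return True
    except ValueError:
        return False

def parse_options(options):
    """Turn "key:value" strings into a dict, converting numeric and boolean
    values the same way LoadModel does before storing them in self.options."""
    parsed = {}
    for opt in options:
        if ":" not in opt:
            continue
        key, value = opt.split(":")
        if is_float(value):
            value = float(value)
        elif is_int(value):
            value = int(value)
        elif value.lower() in ["true", "false"]:
            value = value.lower() == "true"
        parsed[key] = value
    return parsed

opts = parse_options(["multilingual:true", "language:fr", "exaggeration:0.7"])
print(opts)  # {'multilingual': True, 'language': 'fr', 'exaggeration': 0.7}
# In the backend, "multilingual" is consumed at load time to pick
# ChatterboxMultilingualTTS; at synthesis time "language" is mapped to the
# language_id argument and the stored options are merged into generate().
```
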
backend/python/chatterbox/install.sh

Lines changed: 1 addition & 0 deletions
@@ -15,5 +15,6 @@ fi
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
     EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
+EXTRA_PIP_INSTALL_FLAGS+=" --no-build-isolation"
 
 installRequirements

Lines changed: 6 additions & 4 deletions
@@ -1,6 +1,8 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
-torch==2.6.0
-torchaudio==2.6.0
-transformers==4.46.3
-chatterbox-tts==0.1.2
+torch
+torchaudio
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
+#chatterbox-tts==0.1.4

backend/python/chatterbox/requirements-cublas11.txt

Lines changed: 2 additions & 1 deletion
@@ -2,5 +2,6 @@
 torch==2.6.0+cu118
 torchaudio==2.6.0+cu118
 transformers==4.46.3
-chatterbox-tts==0.1.2
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate
Lines changed: 5 additions & 4 deletions
@@ -1,5 +1,6 @@
-torch==2.6.0
-torchaudio==2.6.0
-transformers==4.46.3
-chatterbox-tts==0.1.2
+torch
+torchaudio
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate
Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.6.0+rocm6.1
 torchaudio==2.6.0+rocm6.1
-transformers==4.46.3
-chatterbox-tts==0.1.2
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate

backend/python/chatterbox/requirements-intel.txt

Lines changed: 3 additions & 2 deletions
@@ -2,8 +2,9 @@
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 torchaudio==2.3.1+cxx11.abi
-transformers==4.46.3
-chatterbox-tts==0.1.2
+transformers
+# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
+chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster
 accelerate
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
