feat: TTS Stack - Chatterbox Multilingual + Whisper STT

- Chatterbox TTS Server (Multilingual, 23 Sprachen, Voice Cloning)
- Whisper STT Server (faster-whisper-small, CPU)
- RTX 4060 GPU auf Tdarr VM (node2, 10.2.1.104)
- Voice Profile: chantal.wav
- Chantal Telegram Voice Integration
This commit is contained in:
feldjaeger 2026-04-17 12:37:44 +02:00
commit 6115f3bc09
4 changed files with 167 additions and 0 deletions

50
compose-chatterbox.yaml Normal file
View file

@ -0,0 +1,50 @@
# Compose definition for a single Chatterbox TTS server built from the local
# Dockerfile, with one NVIDIA GPU reserved for the container.
services:
  chatterbox-tts-server:
    build:
      args:
        # Can be nvidia or cpu; Default is Nvidia
        - RUNTIME=nvidia
      context: .
      dockerfile: Dockerfile
    ports:
      # Host port is taken from PORT and falls back to 8004 when PORT is unset.
      - "${PORT:-8004}:8004"
    volumes:
      # Mount local config file for persistence
      - ./config.yaml:/app/config.yaml
      # Mount local directories for persistent app data
      - ./voices:/app/voices
      - ./reference_audio:/app/reference_audio
      - ./outputs:/app/outputs
      - ./logs:/app/logs
      # Named volume for Hugging Face model cache to persist across container rebuilds
      - hf_cache:/app/hf_cache

    # --- GPU Support (NVIDIA) ---
    # The 'deploy' key is the modern way to request GPU resources.
    # If you get a 'CDI device injection failed' error, comment out the 'deploy' section
    # and uncomment the 'runtime: nvidia' line below.

    # Method 1: Modern Docker Compose (Recommended)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    # Method 2: Legacy Docker Compose (for older setups)
    # runtime: nvidia

    restart: unless-stopped
    environment:
      # Read the Hugging Face token from the shell environment or a .env file
      # next to this compose file, so the secret is never committed. The value
      # is empty when HF_TOKEN is not set.
      # NOTE(review): confirm the server treats an empty HF_TOKEN as "no token".
      - HF_TOKEN=${HF_TOKEN:-}
      # Enable faster Hugging Face downloads inside the container
      - HF_HUB_ENABLE_HF_TRANSFER=1
      # Make NVIDIA GPUs visible and specify capabilities for PyTorch
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Define the named volume for the Hugging Face cache
volumes:
  hf_cache:

37
compose.yaml Normal file
View file

@ -0,0 +1,37 @@
# Two-service stack: Chatterbox TTS (GPU-backed) and faster-whisper STT (CPU).
services:
  # Text-to-speech server, built directly from the upstream Git repository.
  chatterbox-tts:
    container_name: chatterbox-tts
    build: https://github.com/devnen/Chatterbox-TTS-Server.git
    restart: unless-stopped
    ports:
      - '8004:8004'
    volumes:
      # Server configuration and local data directories (bind mounts).
      - ./config.yaml:/app/config.yaml
      - ./voices:/app/voices
      - ./reference_audio:/app/reference_audio
      - ./outputs:/app/outputs
      # Named volume for the Hugging Face cache directory.
      - hf_cache:/app/hf_cache
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU available on the host.
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  # Speech-to-text server using the prebuilt CPU image.
  whisper:
    container_name: whisper
    image: fedirz/faster-whisper-server:latest-cpu
    restart: unless-stopped
    ports:
      # Host port 8005 maps to container port 8000.
      - '8005:8000'
    volumes:
      - whisper-cache:/root/.cache/huggingface
    environment:
      WHISPER__MODEL: Systran/faster-whisper-small
      WHISPER__DEVICE: cpu

# Named volumes shared with the services above.
volumes:
  hf_cache: {}
  whisper-cache: {}

68
config-chatterbox.yaml Normal file
View file

@ -0,0 +1,68 @@
# Chatterbox TTS server configuration (full settings file including UI state).

# HTTP server and logging settings.
server:
  host: '0.0.0.0'
  port: 8004
  use_ngrok: false
  # Authentication is switched off here.
  # NOTE(review): the username/password below look like placeholder
  # defaults; replace them before setting use_auth to true.
  use_auth: false
  auth_username: user
  auth_password: password
  log_file_path: logs/tts_server.log
  log_file_max_size_mb: 10
  log_file_backup_count: 5

# Model selection.
model:
  repo_id: chatterbox-multilingual

# Inference device and voice file locations.
tts_engine:
  device: cuda
  predefined_voices_path: voices
  reference_audio_path: reference_audio
  default_voice_id: Emily.wav

paths:
  model_cache: model_cache
  output: outputs

# Default parameters applied to generation requests.
generation_defaults:
  temperature: 0.8
  exaggeration: 1.3
  cfg_weight: 0.5
  seed: 0
  speed_factor: 1.0
  language: en

audio_output:
  format: wav
  sample_rate: 24000
  max_reference_duration_sec: 30
  save_to_disk: false

# Last-used values of the web UI.
ui_state:
  last_text: '[gasp] Okay, we have fifteen minutes left! Fifteen minutes! [groan]
    How have we been in here for forty-five minutes and only solved two puzzles?!
    [clear throat] Let''s think logically. [sniff] This room smells like fear and
    bad decisions. [cough] And also someone''s very strong cologne, Kevin. [sigh]
    Okay, the clue says "the answer lies where time stands still." [chuckle] Great,
    very helpful, very straightforward. [shush] Everyone stop talking for one second!
    [gasp] Wait, the clock on the wall! It''s stuck at three fifteen! [laugh] That''s
    it! That has to be it! [groan] It''s not the combination. Of course it''s not
    the combination. [sniff] Why would anything in this room be logical? [sigh] Has
    anyone checked under the carpet? [gasp] There''s a hidden compartment! [chuckle]
    Kevin, you''re standing on a clue! Classic Kevin! [clear throat] Okay, it''s another
    riddle. "What has keys but no locks, space but no room, and you can enter but
    can''t go inside?" [groan] Who writes these things?! [cough] A keyboard! It''s
    a keyboard! [laugh] There''s a keyboard painted on the wall! [gasp] Eight minutes!
    We have eight minutes! [sigh] If we don''t escape this room, we''re never speaking
    of this again. [shush] Focus, everyone! [chuckle] We''re so close I can taste
    freedom! [sniff] That or more of Kevin''s cologne!
    '
  last_voice_mode: predefined
  last_predefined_voice: Emily.wav
  last_reference_file: Gianna.wav
  last_seed: 3000
  last_chunk_size: 240
  last_split_text_enabled: true
  hide_chunk_warning: false
  hide_generation_warning: false
  theme: light
  # Double quotes are required so the \u26A1 escape is decoded.
  last_preset_name: "\u26A1 Turbo: Escape Room Panic"

# Web UI presentation options.
ui:
  title: Chatterbox TTS Server
  show_language_select: true
  max_predefined_voices_in_dropdown: 50

debug:
  save_intermediate_audio: false

12
config.yaml Normal file
View file

@ -0,0 +1,12 @@
# Minimal Chatterbox TTS server configuration for the German multilingual setup.
server:
  host: "0.0.0.0"
  port: 8004

model:
  # Use the same model identifier as config-chatterbox.yaml, which selects the
  # multilingual model through repo_id. The previous value
  # "ResembleAI/chatterbox" did not match that file or the German default below.
  repo_id: "chatterbox-multilingual"
  # NOTE(review): kept unchanged; confirm the server actually reads this key.
  model_selector: "chatterbox-multilingual"

tts_engine:
  device: cuda
  predefined_voices_path: voices
  reference_audio_path: reference_audio

generation_defaults:
  # Default synthesis language (German).
  language: de