From 6115f3bc090caa6429f474eab49668c0a2fb60a4 Mon Sep 17 00:00:00 2001
From: feldjaeger <feldjaeger@users.noreply.github.com>
Date: Fri, 17 Apr 2026 12:37:44 +0200
Subject: [PATCH] feat: TTS Stack - Chatterbox Multilingual + Whisper STT

- Chatterbox TTS Server (Multilingual, 23 Sprachen, Voice Cloning)
- Whisper STT Server (faster-whisper-small, CPU)
- RTX 4060 GPU auf Tdarr VM (node2, 10.2.1.104)
- Voice Profile: chantal.wav
- Chantal Telegram Voice Integration
---
 compose-chatterbox.yaml | 50 ++++++++++++++++++++++++++++++
 compose.yaml            | 37 ++++++++++++++++++++++
 config-chatterbox.yaml  | 68 +++++++++++++++++++++++++++++++++++++++++
 config.yaml             | 12 ++++++++
 4 files changed, 167 insertions(+)
 create mode 100644 compose-chatterbox.yaml
 create mode 100644 compose.yaml
 create mode 100644 config-chatterbox.yaml
 create mode 100644 config.yaml

diff --git a/compose-chatterbox.yaml b/compose-chatterbox.yaml
new file mode 100644
index 0000000..2a575bd
--- /dev/null
+++ b/compose-chatterbox.yaml
@@ -0,0 +1,50 @@
+services:
+  chatterbox-tts-server:
+    build:
+      args:
+        # RUNTIME can be nvidia or cpu; default is nvidia
+        - RUNTIME=nvidia
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "${PORT:-8004}:8004"
+    volumes:
+      # Mount local config file for persistence
+      - ./config.yaml:/app/config.yaml
+      # Mount local directories for persistent app data
+      - ./voices:/app/voices
+      - ./reference_audio:/app/reference_audio
+      - ./outputs:/app/outputs
+      - ./logs:/app/logs
+      # Named volume for Hugging Face model cache to persist across container rebuilds
+      - hf_cache:/app/hf_cache
+
+    # --- GPU Support (NVIDIA) ---
+    # The 'deploy' key is the modern way to request GPU resources.
+    # If you get a 'CDI device injection failed' error, comment out the 'deploy' section
+    # and uncomment the 'runtime: nvidia' line below.
+
+    # Method 1: Modern Docker Compose (Recommended)
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+    # Method 2: Legacy Docker Compose (for older setups)
+    # runtime: nvidia
+
+    restart: unless-stopped
+    environment:
+      - HF_TOKEN=${HF_TOKEN:-}  # read from the shell or .env; empty when unset
+      # Enable faster Hugging Face downloads inside the container
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+      # Make NVIDIA GPUs visible and specify capabilities for PyTorch
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+# Define the named volume for the Hugging Face cache
+volumes:
+  hf_cache:
diff --git a/compose.yaml b/compose.yaml
new file mode 100644
index 0000000..1471f60
--- /dev/null
+++ b/compose.yaml
@@ -0,0 +1,37 @@
+services:
+  chatterbox-tts:
+    build:
+      context: https://github.com/devnen/Chatterbox-TTS-Server.git  # NOTE(review): no ref given; append #<ref> to pin a revision
+    container_name: chatterbox-tts
+    restart: unless-stopped
+    ports:
+      - "8004:8004"  # host 8004 -> container 8004 (the port set in config.yaml)
+    volumes:
+      - ./config.yaml:/app/config.yaml
+      - ./voices:/app/voices
+      - ./reference_audio:/app/reference_audio
+      - ./outputs:/app/outputs
+      - hf_cache:/app/hf_cache  # named volume, survives container rebuilds
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all  # reserve every NVIDIA GPU visible to the Docker host
+              capabilities: [gpu]
+
+  whisper:
+    image: fedirz/faster-whisper-server:latest-cpu  # NOTE(review): floating tag; pin a version for reproducible deploys
+    container_name: whisper
+    restart: unless-stopped
+    ports:
+      - "8005:8000"  # host 8005 -> container 8000
+    volumes:
+      - whisper-cache:/root/.cache/huggingface
+    environment:
+      - WHISPER__MODEL=Systran/faster-whisper-small
+      - WHISPER__INFERENCE_DEVICE=cpu  # was WHISPER__DEVICE; believed to be the key the server reads — confirm
+
+volumes:
+  hf_cache:
+  whisper-cache:
diff --git a/config-chatterbox.yaml b/config-chatterbox.yaml
new file mode 100644
index 0000000..4d22e11
--- /dev/null
+++ b/config-chatterbox.yaml
@@ -0,0 +1,68 @@
+server:  # NOTE(review): neither compose file in this patch mounts this file (both mount ./config.yaml)
+  host: 0.0.0.0
+  port: 8004  # same container port both compose files publish
+  use_ngrok: false
+  use_auth: false  # NOTE(review): auth is off while host is 0.0.0.0 — confirm the network is trusted
+  auth_username: user
+  auth_password: password  # NOTE(review): placeholder value — replace before enabling use_auth
+  log_file_path: logs/tts_server.log  # presumably relative to /app, where compose-chatterbox.yaml mounts ./logs
+  log_file_max_size_mb: 10
+  log_file_backup_count: 5
+model:
+  repo_id: chatterbox-multilingual
+tts_engine:
+  device: cuda
+  predefined_voices_path: voices
+  reference_audio_path: reference_audio
+  default_voice_id: Emily.wav  # NOTE(review): commit message names chantal.wav as the voice profile — confirm
+paths:
+  model_cache: model_cache  # NOTE(review): compose files persist /app/hf_cache, not this directory — confirm
+  output: outputs
+generation_defaults:
+  temperature: 0.8
+  exaggeration: 1.3
+  cfg_weight: 0.5
+  seed: 0
+  speed_factor: 1.0
+  language: en  # config.yaml in this patch sets de instead
+audio_output:
+  format: wav
+  sample_rate: 24000
+  max_reference_duration_sec: 30
+  save_to_disk: false
+ui_state:
+  last_text: '[gasp] Okay, we have fifteen minutes left! Fifteen minutes! [groan]
+    How have we been in here for forty-five minutes and only solved two puzzles?!
+    [clear throat] Let''s think logically. [sniff] This room smells like fear and
+    bad decisions. [cough] And also someone''s very strong cologne, Kevin. [sigh]
+    Okay, the clue says "the answer lies where time stands still." [chuckle] Great,
+    very helpful, very straightforward. [shush] Everyone stop talking for one second!
+    [gasp] Wait, the clock on the wall! It''s stuck at three fifteen! [laugh] That''s
+    it! That has to be it! [groan] It''s not the combination. Of course it''s not
+    the combination. [sniff] Why would anything in this room be logical? [sigh] Has
+    anyone checked under the carpet? [gasp] There''s a hidden compartment! [chuckle]
+    Kevin, you''re standing on a clue! Classic Kevin! [clear throat] Okay, it''s another
+    riddle. "What has keys but no locks, space but no room, and you can enter but
+    can''t go inside?" [groan] Who writes these things?! [cough] A keyboard! It''s
+    a keyboard! [laugh] There''s a keyboard painted on the wall! [gasp] Eight minutes!
+    We have eight minutes! [sigh] If we don''t escape this room, we''re never speaking
+    of this again. [shush] Focus, everyone! [chuckle] We''re so close I can taste
+    freedom! [sniff] That or more of Kevin''s cologne!
+
+    '
+  last_voice_mode: predefined
+  last_predefined_voice: Emily.wav
+  last_reference_file: Gianna.wav
+  last_seed: 3000
+  last_chunk_size: 240
+  last_split_text_enabled: true
+  hide_chunk_warning: false
+  hide_generation_warning: false
+  theme: light
+  last_preset_name: "\u26A1 Turbo: Escape Room Panic"
+ui:
+  title: Chatterbox TTS Server
+  show_language_select: true
+  max_predefined_voices_in_dropdown: 50
+debug:
+  save_intermediate_audio: false
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..9310fec
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,12 @@
+server:
+  host: "0.0.0.0"
+  port: 8004  # container port published by the compose files
+model:
+  repo_id: "chatterbox-multilingual"  # same selector value config-chatterbox.yaml uses for the multilingual model
+  model_selector: "chatterbox-multilingual"  # NOTE(review): no such key in config-chatterbox.yaml; presumably unused — confirm
+tts_engine:
+  device: cuda
+  predefined_voices_path: voices
+  reference_audio_path: reference_audio
+generation_defaults:
+  language: de  # German output needs the multilingual model selected above