feat: TTS Stack - Chatterbox Multilingual + Whisper STT

- Chatterbox TTS Server (Multilingual, 23 Sprachen, Voice Cloning) - Whisper STT Server (faster-whisper-small, CPU) - RTX 4060 GPU auf Tdarr VM (node2, 10.2.1.104) - Voice Profile: chantal.wav - Chantal Telegram Voice Integration
2026-04-17 12:37:44 +02:00 · 2026-04-17 12:37:44 +02:00 · 6115f3bc09
commit 6115f3bc09
4 changed files with 167 additions and 0 deletions
--- a/compose-chatterbox.yaml
+++ b/compose-chatterbox.yaml
@ -0,0 +1,50 @@
 services:
  chatterbox-tts-server:
    # Build the image from the Dockerfile in this directory.
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # RUNTIME can be nvidia or cpu; nvidia is the default.
        - RUNTIME=nvidia
    restart: unless-stopped
    ports:
      # Host port is taken from PORT and falls back to 8004 when PORT is unset.
      - "${PORT:-8004}:8004"
    volumes:
      # Local config file, mounted so settings persist.
      - ./config.yaml:/app/config.yaml
      # Local directories for persistent app data.
      - ./voices:/app/voices
      - ./reference_audio:/app/reference_audio
      - ./outputs:/app/outputs
      - ./logs:/app/logs
      # Named volume so the Hugging Face model cache survives container rebuilds.
      - hf_cache:/app/hf_cache
    # --- GPU Support (NVIDIA) ---
    # The 'deploy' key is the modern way to request GPU resources.
    # If you get a 'CDI device injection failed' error, comment out the 'deploy' section
    # and uncomment the 'runtime: nvidia' line below.
    # Method 1: Modern Docker Compose (Recommended)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    # Method 2: Legacy Docker Compose (for older setups)
    # runtime: nvidia
    environment:
      # Take the Hugging Face token from the shell environment or a .env file
      # next to this compose file instead of hard-coding it here; the value is
      # empty when HF_TOKEN is unset, so no secret has to be committed.
      - HF_TOKEN=${HF_TOKEN:-}
      # Enable faster Hugging Face downloads inside the container.
      - HF_HUB_ENABLE_HF_TRANSFER=1
      # Make NVIDIA GPUs visible and specify capabilities for PyTorch.
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
 # Named volumes used by this compose file.
 volumes:
  # Default-configured volume backing the Hugging Face cache mount.
  hf_cache: {}
--- a/compose.yaml
+++ b/compose.yaml
@ -0,0 +1,37 @@
 services:
  # Chatterbox TTS server, built from the upstream Git repository.
  chatterbox-tts:
    container_name: chatterbox-tts
    # Short build syntax: the string is the build context.
    build: https://github.com/devnen/Chatterbox-TTS-Server.git
    ports:
      - "8004:8004"
    # Request every available NVIDIA GPU for this container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities:
                - gpu
    volumes:
      # Bind mounts: config file plus voice, reference-audio and output directories.
      - ./config.yaml:/app/config.yaml
      - ./voices:/app/voices
      - ./reference_audio:/app/reference_audio
      - ./outputs:/app/outputs
      # Named volume mounted at the container's hf_cache directory.
      - hf_cache:/app/hf_cache
    restart: unless-stopped
  # Speech-to-text service using the CPU image of faster-whisper-server.
  whisper:
    container_name: whisper
    image: fedirz/faster-whisper-server:latest-cpu
    # Mapping form of the environment; same variables and values as the list form.
    environment:
      WHISPER__MODEL: Systran/faster-whisper-small
      WHISPER__DEVICE: cpu
    ports:
      # Host port 8005 maps to container port 8000.
      - "8005:8000"
    volumes:
      # Named volume mounted over the Hugging Face cache directory of the root user.
      - whisper-cache:/root/.cache/huggingface
    restart: unless-stopped
 # Named volumes used by the two services; both use default settings.
 volumes:
  hf_cache: {}
  whisper-cache: {}
--- a/config-chatterbox.yaml
+++ b/config-chatterbox.yaml
@ -0,0 +1,68 @@
 # Server settings.
 server:
  # Bind address and port.
  host: "0.0.0.0"
  port: 8004
  # ngrok and authentication are both switched off.
  use_ngrok: false
  use_auth: false
  # NOTE(review): placeholder-looking credentials; presumably only consulted
  # when use_auth is true — confirm, and change them before enabling auth.
  auth_username: "user"
  auth_password: "password"
  # Log file path, maximum size in MB, and backup count.
  log_file_path: "logs/tts_server.log"
  log_file_max_size_mb: 10
  log_file_backup_count: 5
 # Model selection.
 model:
  repo_id: "chatterbox-multilingual"
 # TTS engine settings.
 tts_engine:
  # Run on CUDA.
  device: "cuda"
  # Voice that is used by default.
  default_voice_id: "Emily.wav"
  # Relative paths for predefined voices and reference audio.
  predefined_voices_path: "voices"
  reference_audio_path: "reference_audio"
 # Relative paths for the model cache and generated output.
 paths:
  output: "outputs"
  model_cache: "model_cache"
 # Default generation parameters.
 generation_defaults:
  # Default language code.
  language: "en"
  # Numeric defaults; kept as plain numbers so their types do not change.
  seed: 0
  speed_factor: 1.0
  temperature: 0.8
  exaggeration: 1.3
  cfg_weight: 0.5
 # Audio output settings.
 audio_output:
  # WAV output at a sample rate of 24000.
  format: "wav"
  sample_rate: 24000
  # Generated audio is not saved to disk.
  save_to_disk: false
  # Upper bound, in seconds, for reference audio duration.
  max_reference_duration_sec: 30
 # UI state values. The last_* keys presumably record the most recent
 # selections made in the web UI — TODO confirm against the server.
 ui_state:
  # Single-quoted multi-line scalar: '' is an escaped apostrophe and the
  # continuation lines are folded by the YAML parser, so its whitespace and
  # line layout must not be reflowed by hand.
  last_text: '[gasp] Okay, we have fifteen minutes left! Fifteen minutes! [groan]
    How have we been in here for forty-five minutes and only solved two puzzles?!
    [clear throat] Let''s think logically. [sniff] This room smells like fear and
    bad decisions. [cough] And also someone''s very strong cologne, Kevin. [sigh]
    Okay, the clue says "the answer lies where time stands still." [chuckle] Great,
    very helpful, very straightforward. [shush] Everyone stop talking for one second!
    [gasp] Wait, the clock on the wall! It''s stuck at three fifteen! [laugh] That''s
    it! That has to be it! [groan] It''s not the combination. Of course it''s not
    the combination. [sniff] Why would anything in this room be logical? [sigh] Has
    anyone checked under the carpet? [gasp] There''s a hidden compartment! [chuckle]
    Kevin, you''re standing on a clue! Classic Kevin! [clear throat] Okay, it''s another
    riddle. "What has keys but no locks, space but no room, and you can enter but
    can''t go inside?" [groan] Who writes these things?! [cough] A keyboard! It''s
    a keyboard! [laugh] There''s a keyboard painted on the wall! [gasp] Eight minutes!
    We have eight minutes! [sigh] If we don''t escape this room, we''re never speaking
    of this again. [shush] Focus, everyone! [chuckle] We''re so close I can taste
    freedom! [sniff] That or more of Kevin''s cologne!
    '
  # Voice mode and the voice/reference files last chosen.
  last_voice_mode: predefined
  last_predefined_voice: Emily.wav
  last_reference_file: Gianna.wav
  # Last seed, chunk size and text-splitting toggle.
  last_seed: 3000
  last_chunk_size: 240
  last_split_text_enabled: true
  # Warning-visibility flags and UI theme.
  hide_chunk_warning: false
  hide_generation_warning: false
  theme: light
  # Double-quoted so the \u26A1 escape is decoded to the Unicode character.
  last_preset_name: "\u26A1 Turbo: Escape Room Panic"
 # Web UI settings.
 ui:
  title: "Chatterbox TTS Server"
  # Cap for the predefined-voices dropdown.
  max_predefined_voices_in_dropdown: 50
  # The language selector is shown.
  show_language_select: true
 # Debug switches.
 debug:
  # Intermediate audio is not saved.
  save_intermediate_audio: false
--- a/config.yaml
+++ b/config.yaml
@ -0,0 +1,12 @@
 # Server bind address and port.
 server:
  port: 8004
  host: '0.0.0.0'
 # Model selection.
 model:
  # Multilingual model selector, matching the repo_id value used in
  # config-chatterbox.yaml and the model_selector below; the default language
  # in this file is "de", which needs the multilingual model.
  # NOTE(review): presumably the server chooses the model from repo_id —
  # confirm against the Chatterbox-TTS-Server documentation.
  repo_id: "chatterbox-multilingual"
  # NOTE(review): kept unchanged for compatibility; it is unclear from this
  # file whether the server reads this key — confirm.
  model_selector: "chatterbox-multilingual"
 # TTS engine settings.
 tts_engine:
  # Relative paths for predefined voices and reference audio.
  reference_audio_path: "reference_audio"
  predefined_voices_path: "voices"
  # Run on CUDA.
  device: "cuda"
 # Default generation parameters.
 generation_defaults:
  # Default language code.
  language: "de"