From 6115f3bc090caa6429f474eab49668c0a2fb60a4 Mon Sep 17 00:00:00 2001
From: feldjaeger
Date: Fri, 17 Apr 2026 12:37:44 +0200
Subject: [PATCH] feat: TTS Stack - Chatterbox Multilingual + Whisper STT

- Chatterbox TTS Server (Multilingual, 23 Sprachen, Voice Cloning)
- Whisper STT Server (faster-whisper-small, CPU)
- RTX 4060 GPU auf Tdarr VM (node2, 10.2.1.104)
- Voice Profile: chantal.wav
- Chantal Telegram Voice Integration
---
Reviewer notes (git am ignores everything between the "---" separator and the
first "diff --git" line, so this text never reaches the repository):

* Purpose: adds two Compose definitions for a Chatterbox TTS server (one built
  from a local Dockerfile, one built from the upstream Git repository together
  with a CPU Whisper STT container) plus two server configuration files.
* compose-chatterbox.yaml: HF_TOKEN is now interpolated from the shell
  environment or a .env file (empty when unset) instead of shipping the
  literal placeholder YOUR_TOKEN_HERE as the token value.
* compose-chatterbox.yaml and compose.yaml both publish host port 8004 by
  default, so the two stacks cannot run side by side on one host unless PORT
  is overridden for the former.
* Both compose files mount ./config.yaml; nothing in this patch mounts
  config-chatterbox.yaml.
* NOTE(review): WHISPER__DEVICE and model.model_selector are taken on trust
  here; confirm both names against the faster-whisper-server and
  Chatterbox-TTS-Server configuration references.

 compose-chatterbox.yaml | 50 ++++++++++++++++++++++++++++++
 compose.yaml            | 37 ++++++++++++++++++++++
 config-chatterbox.yaml  | 68 +++++++++++++++++++++++++++++++++++++++++
 config.yaml             | 12 ++++++++
 4 files changed, 167 insertions(+)
 create mode 100644 compose-chatterbox.yaml
 create mode 100644 compose.yaml
 create mode 100644 config-chatterbox.yaml
 create mode 100644 config.yaml

diff --git a/compose-chatterbox.yaml b/compose-chatterbox.yaml
new file mode 100644
index 0000000..2a575bd
--- /dev/null
+++ b/compose-chatterbox.yaml
@@ -0,0 +1,50 @@
+services:
+  chatterbox-tts-server:
+    build:
+      args:
+        # Can be nvidia or cpu; Default is Nvidia
+        - RUNTIME=nvidia
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "${PORT:-8004}:8004"
+    volumes:
+      # Mount local config file for persistence
+      - ./config.yaml:/app/config.yaml
+      # Mount local directories for persistent app data
+      - ./voices:/app/voices
+      - ./reference_audio:/app/reference_audio
+      - ./outputs:/app/outputs
+      - ./logs:/app/logs
+      # Named volume for Hugging Face model cache to persist across container rebuilds
+      - hf_cache:/app/hf_cache
+
+    # --- GPU Support (NVIDIA) ---
+    # The 'deploy' key is the modern way to request GPU resources.
+    # If you get a 'CDI device injection failed' error, comment out the 'deploy' section
+    # and uncomment the 'runtime: nvidia' line below.
+
+    # Method 1: Modern Docker Compose (Recommended)
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+    # Method 2: Legacy Docker Compose (for older setups)
+    # runtime: nvidia
+
+    restart: unless-stopped
+    environment:
+      - HF_TOKEN=${HF_TOKEN:-}  # taken from the shell or .env; empty when unset
+      # Enable faster Hugging Face downloads inside the container
+      - HF_HUB_ENABLE_HF_TRANSFER=1
+      # Make NVIDIA GPUs visible and specify capabilities for PyTorch
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+# Define the named volume for the Hugging Face cache
+volumes:
+  hf_cache:
diff --git a/compose.yaml b/compose.yaml
new file mode 100644
index 0000000..1471f60
--- /dev/null
+++ b/compose.yaml
@@ -0,0 +1,37 @@
+services:
+  chatterbox-tts:
+    build:
+      context: https://github.com/devnen/Chatterbox-TTS-Server.git
+    container_name: chatterbox-tts
+    restart: unless-stopped
+    ports:
+      - "8004:8004"
+    volumes:
+      - ./config.yaml:/app/config.yaml
+      - ./voices:/app/voices
+      - ./reference_audio:/app/reference_audio
+      - ./outputs:/app/outputs
+      - hf_cache:/app/hf_cache
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+  whisper:
+    image: fedirz/faster-whisper-server:latest-cpu
+    container_name: whisper
+    restart: unless-stopped
+    ports:
+      - "8005:8000"
+    volumes:
+      - whisper-cache:/root/.cache/huggingface
+    environment:
+      - WHISPER__MODEL=Systran/faster-whisper-small
+      - WHISPER__DEVICE=cpu
+
+volumes:
+  hf_cache:
+  whisper-cache:
diff --git a/config-chatterbox.yaml b/config-chatterbox.yaml
new file mode 100644
index 0000000..4d22e11
--- /dev/null
+++ b/config-chatterbox.yaml
@@ -0,0 +1,68 @@
+server:
+  host: 0.0.0.0
+  port: 8004
+  use_ngrok: false
+  use_auth: false
+  auth_username: user
+  auth_password: password
+  log_file_path: logs/tts_server.log
+  log_file_max_size_mb: 10
+  log_file_backup_count: 5
+model:
+  repo_id: chatterbox-multilingual
+tts_engine:
+  device: cuda
+  predefined_voices_path: voices
+  reference_audio_path: reference_audio
+  default_voice_id: Emily.wav
+paths:
+  model_cache: model_cache
+  output: outputs
+generation_defaults:
+  temperature: 0.8
+  exaggeration: 1.3
+  cfg_weight: 0.5
+  seed: 0
+  speed_factor: 1.0
+  language: en
+audio_output:
+  format: wav
+  sample_rate: 24000
+  max_reference_duration_sec: 30
+  save_to_disk: false
+ui_state:
+  last_text: '[gasp] Okay, we have fifteen minutes left! Fifteen minutes! [groan]
+    How have we been in here for forty-five minutes and only solved two puzzles?!
+    [clear throat] Let''s think logically. [sniff] This room smells like fear and
+    bad decisions. [cough] And also someone''s very strong cologne, Kevin. [sigh]
+    Okay, the clue says "the answer lies where time stands still." [chuckle] Great,
+    very helpful, very straightforward. [shush] Everyone stop talking for one second!
+    [gasp] Wait, the clock on the wall! It''s stuck at three fifteen! [laugh] That''s
+    it! That has to be it! [groan] It''s not the combination. Of course it''s not
+    the combination. [sniff] Why would anything in this room be logical? [sigh] Has
+    anyone checked under the carpet? [gasp] There''s a hidden compartment! [chuckle]
+    Kevin, you''re standing on a clue! Classic Kevin! [clear throat] Okay, it''s another
+    riddle. "What has keys but no locks, space but no room, and you can enter but
+    can''t go inside?" [groan] Who writes these things?! [cough] A keyboard! It''s
+    a keyboard! [laugh] There''s a keyboard painted on the wall! [gasp] Eight minutes!
+    We have eight minutes! [sigh] If we don''t escape this room, we''re never speaking
+    of this again. [shush] Focus, everyone! [chuckle] We''re so close I can taste
+    freedom! [sniff] That or more of Kevin''s cologne!
+
+    '
+  last_voice_mode: predefined
+  last_predefined_voice: Emily.wav
+  last_reference_file: Gianna.wav
+  last_seed: 3000
+  last_chunk_size: 240
+  last_split_text_enabled: true
+  hide_chunk_warning: false
+  hide_generation_warning: false
+  theme: light
+  last_preset_name: "\u26A1 Turbo: Escape Room Panic"
+ui:
+  title: Chatterbox TTS Server
+  show_language_select: true
+  max_predefined_voices_in_dropdown: 50
+debug:
+  save_intermediate_audio: false
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..9310fec
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,12 @@
+server:
+  host: "0.0.0.0"
+  port: 8004
+model:
+  repo_id: "ResembleAI/chatterbox"
+  model_selector: "chatterbox-multilingual"
+tts_engine:
+  device: cuda
+  predefined_voices_path: voices
+  reference_audio_path: reference_audio
+generation_defaults:
+  language: de