feat: TTS Stack - Chatterbox Multilingual + Whisper STT
- Chatterbox TTS Server (Multilingual, 23 Sprachen, Voice Cloning)
- Whisper STT Server (faster-whisper-small, CPU)
- RTX 4060 GPU auf Tdarr VM (node2, 10.2.1.104)
- Voice Profile: chantal.wav
- Chantal Telegram Voice Integration
This commit is contained in:
commit
6115f3bc09
4 changed files with 167 additions and 0 deletions
50
compose-chatterbox.yaml
Normal file
50
compose-chatterbox.yaml
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
# Docker Compose definition for a single Chatterbox TTS server container,
# built from the Dockerfile in this directory with one NVIDIA GPU reserved.
services:
  chatterbox-tts-server:
    build:
      args:
        # Can be nvidia or cpu; default is nvidia
        - RUNTIME=nvidia
      context: .
      dockerfile: Dockerfile
    ports:
      # Host port comes from $PORT and falls back to 8004 when unset.
      - "${PORT:-8004}:8004"
    volumes:
      # Mount local config file for persistence
      - ./config.yaml:/app/config.yaml
      # Mount local directories for persistent app data
      - ./voices:/app/voices
      - ./reference_audio:/app/reference_audio
      - ./outputs:/app/outputs
      - ./logs:/app/logs
      # Named volume for Hugging Face model cache to persist across container rebuilds
      - hf_cache:/app/hf_cache

    # --- GPU Support (NVIDIA) ---
    # The 'deploy' key is the modern way to request GPU resources.
    # If you get a 'CDI device injection failed' error, comment out the 'deploy' section
    # and uncomment the 'runtime: nvidia' line below.

    # Method 1: Modern Docker Compose (Recommended)
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    # Method 2: Legacy Docker Compose (for older setups)
    # runtime: nvidia

    restart: unless-stopped
    environment:
      # Hugging Face access token. Compose interpolates it from the shell
      # environment or from a .env file next to this compose file; it is
      # passed as an empty value when unset. Never commit the real token here.
      - HF_TOKEN=${HF_TOKEN:-}
      # Enable faster Hugging Face downloads inside the container
      - HF_HUB_ENABLE_HF_TRANSFER=1
      # Make NVIDIA GPUs visible and specify capabilities for PyTorch
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Define the named volume for the Hugging Face cache
volumes:
  hf_cache: {}
|
||||||
37
compose.yaml
Normal file
37
compose.yaml
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
# Speech stack: Chatterbox TTS (GPU) and faster-whisper STT (CPU).
services:
  # Text-to-speech server, built directly from the upstream Git repository.
  chatterbox-tts:
    container_name: chatterbox-tts
    build: https://github.com/devnen/Chatterbox-TTS-Server.git
    restart: unless-stopped
    ports:
      - "8004:8004"
    volumes:
      # Server configuration and voice/reference/output data live on the host.
      - ./config.yaml:/app/config.yaml
      - ./voices:/app/voices
      - ./reference_audio:/app/reference_audio
      - ./outputs:/app/outputs
      # Named volume mounted at the container's hf_cache directory.
      - hf_cache:/app/hf_cache
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU available on the host.
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  # Speech-to-text server; host port 8005 maps to container port 8000.
  whisper:
    container_name: whisper
    image: fedirz/faster-whisper-server:latest-cpu
    restart: unless-stopped
    ports:
      - "8005:8000"
    volumes:
      - whisper-cache:/root/.cache/huggingface
    environment:
      WHISPER__MODEL: "Systran/faster-whisper-small"
      WHISPER__DEVICE: "cpu"

# Named volumes referenced by the services above.
volumes:
  hf_cache: {}
  whisper-cache: {}
|
||||||
68
config-chatterbox.yaml
Normal file
68
config-chatterbox.yaml
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
# Chatterbox TTS Server configuration.
# NOTE(review): the ui_state section looks like it is written back by the
# server's web UI - confirm before hand-editing those keys.

# HTTP listener, optional basic auth and log rotation settings.
server:
  host: '0.0.0.0'
  port: 8004
  use_ngrok: false
  # Auth is switched off, so the username/password below are presumably unused.
  use_auth: false
  auth_username: 'user'
  auth_password: 'password'
  log_file_path: 'logs/tts_server.log'
  log_file_max_size_mb: 10
  log_file_backup_count: 5

# Model selection.
model:
  repo_id: 'chatterbox-multilingual'

# Inference device and voice locations.
tts_engine:
  device: 'cuda'
  predefined_voices_path: 'voices'
  reference_audio_path: 'reference_audio'
  default_voice_id: 'Emily.wav'

# Storage locations.
paths:
  model_cache: 'model_cache'
  output: 'outputs'

# Default generation parameters.
generation_defaults:
  temperature: 0.8
  exaggeration: 1.3
  cfg_weight: 0.5
  seed: 0
  speed_factor: 1.0
  language: 'en'

# Audio output settings.
audio_output:
  format: 'wav'
  sample_rate: 24000
  max_reference_duration_sec: 30
  save_to_disk: false

# Last-used UI values.
ui_state:
  # Folded scalar: the lines below join with single spaces into one line
  # that ends with exactly one newline.
  last_text: >
    [gasp] Okay, we have fifteen minutes left! Fifteen minutes! [groan]
    How have we been in here for forty-five minutes and only solved two puzzles?!
    [clear throat] Let's think logically. [sniff] This room smells like fear and
    bad decisions. [cough] And also someone's very strong cologne, Kevin. [sigh]
    Okay, the clue says "the answer lies where time stands still." [chuckle] Great,
    very helpful, very straightforward. [shush] Everyone stop talking for one second!
    [gasp] Wait, the clock on the wall! It's stuck at three fifteen! [laugh] That's
    it! That has to be it! [groan] It's not the combination. Of course it's not
    the combination. [sniff] Why would anything in this room be logical? [sigh] Has
    anyone checked under the carpet? [gasp] There's a hidden compartment! [chuckle]
    Kevin, you're standing on a clue! Classic Kevin! [clear throat] Okay, it's another
    riddle. "What has keys but no locks, space but no room, and you can enter but
    can't go inside?" [groan] Who writes these things?! [cough] A keyboard! It's
    a keyboard! [laugh] There's a keyboard painted on the wall! [gasp] Eight minutes!
    We have eight minutes! [sigh] If we don't escape this room, we're never speaking
    of this again. [shush] Focus, everyone! [chuckle] We're so close I can taste
    freedom! [sniff] That or more of Kevin's cologne!
  last_voice_mode: 'predefined'
  last_predefined_voice: 'Emily.wav'
  last_reference_file: 'Gianna.wav'
  last_seed: 3000
  last_chunk_size: 240
  last_split_text_enabled: true
  hide_chunk_warning: false
  hide_generation_warning: false
  theme: 'light'
  # Double quotes are required here for the \u escape sequence.
  last_preset_name: "\u26A1 Turbo: Escape Room Panic"

# Web UI presentation options.
ui:
  title: 'Chatterbox TTS Server'
  show_language_select: true
  max_predefined_voices_in_dropdown: 50

# Debug switches.
debug:
  save_intermediate_audio: false
|
||||||
12
config.yaml
Normal file
12
config.yaml
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
# Minimal Chatterbox TTS Server configuration; compose.yaml bind-mounts this
# file to /app/config.yaml.
server:
  host: "0.0.0.0"
  port: 8004
model:
  # Same selector value that config-chatterbox.yaml uses for the multilingual
  # model. The previous value "ResembleAI/chatterbox" did not match it and
  # presumably selected the original model, which conflicts with language: de.
  repo_id: "chatterbox-multilingual"
  # NOTE(review): this key does not appear in config-chatterbox.yaml and may
  # be ignored by the server - confirm, then drop it if unused.
  model_selector: "chatterbox-multilingual"
tts_engine:
  device: cuda
  predefined_voices_path: voices
  reference_audio_path: reference_audio
generation_defaults:
  # German as the default synthesis language.
  language: de
|
||||||
Loading…
Add table
Add a link
Reference in a new issue