# wyoming-services/wyoming-chatterbox.py

"""Wyoming protocol TTS wrapper for Chatterbox TTS server."""
import argparse
import asyncio
import io
import logging
import wave

import aiohttp
from wyoming.audio import AudioChunk, AudioStart, AudioStop
from wyoming.event import Event
from wyoming.info import Attribution, Describe, Info, TtsProgram, TtsVoice
from wyoming.server import AsyncEventHandler, AsyncServer
from wyoming.tts import Synthesize

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)
# Service description written back in reply to Describe events.
# Stays None until main() builds and assigns it.
INFO = None

class ChatterboxHandler(AsyncEventHandler):
    """Per-connection Wyoming handler that proxies TTS requests to Chatterbox.

    Describe events are answered with the module-level ``INFO``. Synthesize
    events are forwarded to the Chatterbox HTTP endpoint and the returned WAV
    is relayed to the client as audio-start / audio-chunk / audio-stop events.
    All other event types are ignored.
    """

    def __init__(self, reader, writer, cli_args, *a, **kw):
        """Store the parsed CLI options next to the connection streams.

        Args:
            reader: Stream reader passed through to ``AsyncEventHandler``.
            writer: Stream writer passed through to ``AsyncEventHandler``.
            cli_args: Parsed options; ``chatterbox_url`` and ``voice`` are
                read when synthesizing.
            *a: Extra positional arguments for the base class.
            **kw: Extra keyword arguments for the base class.
        """
        super().__init__(reader, writer, *a, **kw)
        self.cli_args = cli_args

    async def _fetch_wav(self, text):
        """POST *text* to the Chatterbox speech endpoint and return WAV bytes.

        Raises:
            aiohttp.ClientResponseError: If the server answers with an HTTP
                error status.
        """
        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"{self.cli_args.chatterbox_url}/v1/audio/speech",
                json={"model":"chatterbox","input":text,"voice":self.cli_args.voice,"response_format":"wav"},
                timeout=aiohttp.ClientTimeout(total=60)
            ) as resp:
                # Fail here on 4xx/5xx instead of handing an error body to
                # the WAV parser, which would log a misleading parse error.
                resp.raise_for_status()
                return await resp.read()

    @staticmethod
    def _decode_wav(wav_data):
        """Return ``(rate, width, channels, pcm_bytes)`` parsed from WAV bytes."""
        with io.BytesIO(wav_data) as wav_io, wave.open(wav_io, "rb") as wf:
            return (
                wf.getframerate(),
                wf.getsampwidth(),
                wf.getnchannels(),
                wf.readframes(wf.getnframes()),
            )

    async def handle_event(self, event: Event) -> bool:
        """Handle one incoming Wyoming event.

        Args:
            event: The event read from the client connection.

        Returns:
            Always ``True``. Synthesis failures are logged, not raised.
        """
        if Describe.is_type(event.type):
            await self.write_event(INFO.event())
            return True
        if not Synthesize.is_type(event.type):
            return True
        synth = Synthesize.from_event(event)
        text = synth.text or ""
        _LOGGER.info("Synthesizing: %s", text[:80])
        try:
            wav_data = await self._fetch_wav(text)
            # Decode fully before writing anything, so a bad WAV never leaves
            # the client with a half-started audio stream.
            rate, width, channels, audio = self._decode_wav(wav_data)
            # The Wyoming TTS reply is audio-start, audio-chunk(s), audio-stop.
            await self.write_event(AudioStart(rate=rate, width=width, channels=channels).event())
            await self.write_event(AudioChunk(rate=rate, width=width, channels=channels, audio=audio).event())
            await self.write_event(AudioStop().event())
        except Exception:
            # NOTE(review): on failure nothing is sent back, so the client
            # presumably waits until its own timeout -- confirm whether an
            # error event should be written here.
            _LOGGER.exception("TTS failed")
        return True

async def main():
    """Parse CLI options, publish the service description, and serve forever.

    Assigns the module-level ``INFO`` before the server starts, so handlers
    can answer Describe events.
    """
    global INFO

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--port", type=int, default=10201)
    arg_parser.add_argument("--chatterbox-url", default="http://10.2.1.104:8004")
    arg_parser.add_argument("--voice", default="homer.mp3")
    options = arg_parser.parse_args()

    # One attribution object is shared by the program and its voice entry.
    credit = Attribution(
        name="Chatterbox",
        url="https://github.com/resemble-ai/chatterbox",
    )
    homer_voice = TtsVoice(
        name="homer",
        description="Homer Simpson",
        attribution=credit,
        version="1.0",
        languages=["de", "en"],
        installed=True,
    )
    program = TtsProgram(
        name="chatterbox",
        description="Chatterbox TTS (Homer)",
        attribution=credit,
        installed=True,
        version="1.0",
        voices=[homer_voice],
    )
    INFO = Info(tts=[program])

    def make_handler(reader, writer):
        # Each connection gets its own handler, all sharing the parsed options.
        return ChatterboxHandler(reader, writer, options)

    port = options.port
    server = AsyncServer.from_uri(f"tcp://0.0.0.0:{port}")
    _LOGGER.info("Wyoming Chatterbox on port %d, voice=%s", port, options.voice)
    await server.run(make_handler)

# Script entry point: enable INFO-level logging, then run main() on a new
# asyncio event loop.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    asyncio.run(main())