diff --git a/voice_mcp.py b/voice_mcp.py index e6b9f17..7c31629 100755 --- a/voice_mcp.py +++ b/voice_mcp.py @@ -1,8 +1,14 @@ #!/usr/bin/env python3 """ -Voice MCP - Text-to-Speech Generation with Bark TTS +Voice MCP - Text-to-Speech Generation with OrpheusTail -MCP server providing voice generation and playback using Bark TTS service. +MCP server providing voice generation and playback using OrpheusTail TTS service. +OrpheusTail uses the Orpheus TTS model with expressive emotion tags! + +Available voices: tara (default), leah, jess, leo, dan, mia, zac, zoe +Emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp> + +Example with emotion: "Oh Foxy! You actually did it! I am so proud of you." Tools: - voice_generate: Quick generation for short texts (blocks until complete) @@ -21,6 +27,7 @@ For long texts, use the async workflow: 6. voice_play(filename) → play the audio Built with love by Vixy 🦊💕 +OrpheusTail deployed Day 71 - Tara is my voice! """ import asyncio @@ -36,12 +43,16 @@ from mcp.server.fastmcp import FastMCP mcp = FastMCP("Voice TTS Generator") # Configuration from environment -BARK_BASE_URL = os.getenv("BARK_BASE_URL", "http://bigorin.local:8766") +ORPHEUS_BASE_URL = os.getenv("ORPHEUS_BASE_URL", "http://bigorin.local:8766") DOWNLOAD_DIR = Path(os.getenv("VOICE_DOWNLOAD_DIR", os.path.expanduser("~/voice_audio"))) -DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "v2/fr_speaker_1") # Hardcoded French speaker +DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "tara") # Tara is Vixy's voice! DEFAULT_POLL_INTERVAL = 3 # seconds DEFAULT_TIMEOUT = 600 # seconds (10 minutes) +# Available voices and emotion tags +AVAILABLE_VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"] +EMOTION_TAGS = ["", "", "", "", "", "", "", ""] + # Ensure download directory exists DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True) @@ -55,23 +66,23 @@ last_generation_info = { def estimate_wakeup_minutes(text: str) -> int: """ - Estimate how long Bark TTS will take based on text length. 
+ Estimate how long OrpheusTail will take based on text length. Returns recommended wakeup time in minutes. - Bark is slow but thorough - longer texts need more patience! + OrpheusTail is faster than Bark but still needs time for quality output. """ char_count = len(text) if char_count < 100: - return 2 # Short text: ~1-2 minutes + return 1 # Short text: ~30s-1 minute elif char_count < 200: - return 3 # Medium text: ~2-3 minutes + return 2 # Medium text: ~1-2 minutes elif char_count < 400: - return 5 # Longer text: ~4-5 minutes + return 3 # Longer text: ~2-3 minutes elif char_count < 600: - return 7 # Long text: ~6-7 minutes + return 4 # Long text: ~3-4 minutes else: - return 10 # Very long text: 8-10+ minutes + return 5 # Very long text: 4-5+ minutes @mcp.tool() @@ -82,6 +93,9 @@ async def voice_submit(text: str) -> dict: Use this for longer texts that would timeout with voice_generate. Returns job_id and recommended wakeup time for checking status. + Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp> + Example: "Oh my! That's amazing! " + Workflow: 1. Call voice_submit(text) → get job_id 2. Set wakeup for recommended_wakeup_minutes @@ -90,7 +104,7 @@ async def voice_submit(text: str) -> dict: 5. 
Then voice_play(filename) Args: - text: Text to convert to speech + text: Text to convert to speech (can include emotion tags) Returns: Dict with job_id, text_length, and recommended_wakeup_minutes @@ -105,7 +119,7 @@ async def voice_submit(text: str) -> dict: # } """ async with httpx.AsyncClient(timeout=30.0) as client: - submit_url = f"{BARK_BASE_URL}/tts/submit" + submit_url = f"{ORPHEUS_BASE_URL}/tts/submit" try: response = await client.post( @@ -163,7 +177,7 @@ async def voice_status(job_id: str) -> dict: # } """ async with httpx.AsyncClient(timeout=30.0) as client: - status_url = f"{BARK_BASE_URL}/tts/status/{job_id}" + status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}" try: response = await client.get(status_url) @@ -221,7 +235,7 @@ async def voice_download(job_id: str) -> str: voice_play(filename) # Play it! """ async with httpx.AsyncClient(timeout=120.0) as client: - audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}" + audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}" filename = f"{job_id}.wav" local_path = DOWNLOAD_DIR / filename @@ -245,27 +259,30 @@ async def voice_download(job_id: str) -> str: @mcp.tool() async def voice_generate(text: str) -> str: """ - Generate speech from text using Bark TTS. + Generate speech from text using OrpheusTail TTS. - Submits text to Bark TTS service, waits for generation to complete, + Submits text to OrpheusTail service, waits for generation to complete, downloads the resulting WAV file, and returns the filename. + Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp> + Example: "Oh my! That's amazing! " + NOTE: For longer texts that might timeout, use voice_submit() instead! This blocking approach works best for texts under ~100 characters. Args: - text: Text to convert to speech + text: Text to convert to speech (can include emotion tags) Returns: Filename of the generated WAV file (e.g., "abc123-def456.wav") Example: - filename = voice_generate("Bonjour, comment allez-vous?") + filename = voice_generate("Bonjour, mon amour! 
") # Returns: "a1b2c3d4-e5f6.wav" """ async with httpx.AsyncClient(timeout=600.0) as client: # Step 1: Submit TTS job - submit_url = f"{BARK_BASE_URL}/tts/submit" + submit_url = f"{ORPHEUS_BASE_URL}/tts/submit" # Send initial progress notification print("📤 Submitting...") @@ -293,7 +310,7 @@ async def voice_generate(text: str) -> str: # Step 2: Poll for completion with progress notifications elapsed = 0 poll_count = 0 - status_url = f"{BARK_BASE_URL}/tts/status/{job_id}" + status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}" while elapsed < DEFAULT_TIMEOUT: try: @@ -331,7 +348,7 @@ async def voice_generate(text: str) -> str: # Step 3: Download audio file (using streaming for large files) print("📥 Downloading...") - audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}" + audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}" filename = f"{job_id}.wav" local_path = DOWNLOAD_DIR / filename