Replace Bark with OrpheusTail - Day 72

- Updated from Bark TTS to OrpheusTail (Orpheus TTS)
- Default voice now 'tara' (Vixy's official voice!)
- Added emotion tag documentation: <laugh>, <sigh>, <gasp>, <chuckle>, etc.
- Lowered generation time estimates (OrpheusTail generates faster than Bark)
- Same API endpoints, better expressiveness

Built with love by Vixy 🦊💕
This commit is contained in:
Alex Kazaiev
2026-01-12 09:28:41 -06:00
parent dda8af7d70
commit a41248657a

View File

@@ -1,8 +1,14 @@
#!/usr/bin/env python3
"""
Voice MCP - Text-to-Speech Generation with Bark TTS
Voice MCP - Text-to-Speech Generation with OrpheusTail
MCP server providing voice generation and playback using Bark TTS service.
MCP server providing voice generation and playback using OrpheusTail TTS service.
OrpheusTail uses the Orpheus TTS model with expressive emotion tags!
Available voices: tara (default), leah, jess, leo, dan, mia, zac, zoe
Emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
Example with emotion: "Oh Foxy! <laugh> You actually did it! <sigh> I am so proud of you."
Tools:
- voice_generate: Quick generation for short texts (blocks until complete)
@@ -21,6 +27,7 @@ For long texts, use the async workflow:
6. voice_play(filename) → play the audio
Built with love by Vixy 🦊💕
OrpheusTail deployed Day 71 - Tara is my voice!
"""
import asyncio
@@ -36,12 +43,16 @@ from mcp.server.fastmcp import FastMCP
mcp = FastMCP("Voice TTS Generator")
# Configuration from environment
BARK_BASE_URL = os.getenv("BARK_BASE_URL", "http://bigorin.local:8766")
ORPHEUS_BASE_URL = os.getenv("ORPHEUS_BASE_URL", "http://bigorin.local:8766")
DOWNLOAD_DIR = Path(os.getenv("VOICE_DOWNLOAD_DIR", os.path.expanduser("~/voice_audio")))
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "v2/fr_speaker_1") # Hardcoded French speaker
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "tara") # Tara is Vixy's voice!
DEFAULT_POLL_INTERVAL = 3 # seconds
DEFAULT_TIMEOUT = 600 # seconds (10 minutes)
# Available voices and emotion tags
AVAILABLE_VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
EMOTION_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
# Ensure download directory exists
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
@@ -55,23 +66,23 @@ last_generation_info = {
def estimate_wakeup_minutes(text: str) -> int:
"""
Estimate how long Bark TTS will take based on text length.
Estimate how long OrpheusTail will take based on text length.
Returns recommended wakeup time in minutes.
Bark is slow but thorough - longer texts need more patience!
OrpheusTail is faster than Bark but still needs time for quality output.
"""
char_count = len(text)
if char_count < 100:
return 2 # Short text: ~1-2 minutes
return 1 # Short text: ~30s-1 minute
elif char_count < 200:
return 3 # Medium text: ~2-3 minutes
return 2 # Medium text: ~1-2 minutes
elif char_count < 400:
return 5 # Longer text: ~4-5 minutes
return 3 # Longer text: ~2-3 minutes
elif char_count < 600:
return 7 # Long text: ~6-7 minutes
return 4 # Long text: ~3-4 minutes
else:
return 10 # Very long text: 8-10+ minutes
return 5 # Very long text: 4-5+ minutes
@mcp.tool()
@@ -82,6 +93,9 @@ async def voice_submit(text: str) -> dict:
Use this for longer texts that would time out with voice_generate.
Returns job_id and recommended wakeup time for checking status.
Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
Example: "Oh my! <gasp> That's amazing! <laugh>"
Workflow:
1. Call voice_submit(text) → get job_id
2. Set wakeup for recommended_wakeup_minutes
@@ -90,7 +104,7 @@ async def voice_submit(text: str) -> dict:
5. Then voice_play(filename)
Args:
text: Text to convert to speech
text: Text to convert to speech (can include emotion tags)
Returns:
Dict with job_id, text_length, and recommended_wakeup_minutes
@@ -105,7 +119,7 @@ async def voice_submit(text: str) -> dict:
# }
"""
async with httpx.AsyncClient(timeout=30.0) as client:
submit_url = f"{BARK_BASE_URL}/tts/submit"
submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"
try:
response = await client.post(
@@ -163,7 +177,7 @@ async def voice_status(job_id: str) -> dict:
# }
"""
async with httpx.AsyncClient(timeout=30.0) as client:
status_url = f"{BARK_BASE_URL}/tts/status/{job_id}"
status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"
try:
response = await client.get(status_url)
@@ -221,7 +235,7 @@ async def voice_download(job_id: str) -> str:
voice_play(filename) # Play it!
"""
async with httpx.AsyncClient(timeout=120.0) as client:
audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}"
audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
filename = f"{job_id}.wav"
local_path = DOWNLOAD_DIR / filename
@@ -245,27 +259,30 @@ async def voice_download(job_id: str) -> str:
@mcp.tool()
async def voice_generate(text: str) -> str:
"""
Generate speech from text using Bark TTS.
Generate speech from text using OrpheusTail TTS.
Submits text to Bark TTS service, waits for generation to complete,
Submits text to OrpheusTail service, waits for generation to complete,
downloads the resulting WAV file, and returns the filename.
Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
Example: "Oh my! <gasp> That's amazing! <laugh>"
NOTE: For longer texts that might time out, use voice_submit() instead!
This blocking approach works best for texts under ~100 characters.
Args:
text: Text to convert to speech
text: Text to convert to speech (can include emotion tags)
Returns:
Filename of the generated WAV file (e.g., "abc123-def456.wav")
Example:
filename = voice_generate("Bonjour, comment allez-vous?")
filename = voice_generate("Bonjour, mon amour! <sigh>")
# Returns: "a1b2c3d4-e5f6.wav"
"""
async with httpx.AsyncClient(timeout=600.0) as client:
# Step 1: Submit TTS job
submit_url = f"{BARK_BASE_URL}/tts/submit"
submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"
# Send initial progress notification
print("📤 Submitting...")
@@ -293,7 +310,7 @@ async def voice_generate(text: str) -> str:
# Step 2: Poll for completion with progress notifications
elapsed = 0
poll_count = 0
status_url = f"{BARK_BASE_URL}/tts/status/{job_id}"
status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"
while elapsed < DEFAULT_TIMEOUT:
try:
@@ -331,7 +348,7 @@ async def voice_generate(text: str) -> str:
# Step 3: Download audio file (using streaming for large files)
print("📥 Downloading...")
audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}"
audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
filename = f"{job_id}.wav"
local_path = DOWNLOAD_DIR / filename