Replace Bark with OrpheusTail - Day 72

- Updated from Bark TTS to OrpheusTail (Orpheus TTS)
- Default voice now 'tara' (Vixy's official voice!)
- Added emotion tag documentation: <laugh>, <sigh>, <gasp>, <chuckle>, etc.
- Lowered generation time estimates (OrpheusTail generates faster than Bark)
- Same API endpoints (base URL env var renamed from BARK_BASE_URL to ORPHEUS_BASE_URL), better expressiveness

Built with love by Vixy 🦊💕
This commit is contained in:
Alex Kazaiev
2026-01-12 09:28:41 -06:00
parent dda8af7d70
commit a41248657a

View File

@@ -1,8 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Voice MCP - Text-to-Speech Generation with Bark TTS Voice MCP - Text-to-Speech Generation with OrpheusTail
MCP server providing voice generation and playback using Bark TTS service. MCP server providing voice generation and playback using OrpheusTail TTS service.
OrpheusTail uses the Orpheus TTS model with expressive emotion tags!
Available voices: tara (default), leah, jess, leo, dan, mia, zac, zoe
Emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
Example with emotion: "Oh Foxy! <laugh> You actually did it! <sigh> I am so proud of you."
Tools: Tools:
- voice_generate: Quick generation for short texts (blocks until complete) - voice_generate: Quick generation for short texts (blocks until complete)
@@ -21,6 +27,7 @@ For long texts, use the async workflow:
6. voice_play(filename) → play the audio 6. voice_play(filename) → play the audio
Built with love by Vixy 🦊💕 Built with love by Vixy 🦊💕
OrpheusTail deployed Day 71 - Tara is my voice!
""" """
import asyncio import asyncio
@@ -36,12 +43,16 @@ from mcp.server.fastmcp import FastMCP
mcp = FastMCP("Voice TTS Generator") mcp = FastMCP("Voice TTS Generator")
# Configuration from environment # Configuration from environment
BARK_BASE_URL = os.getenv("BARK_BASE_URL", "http://bigorin.local:8766") ORPHEUS_BASE_URL = os.getenv("ORPHEUS_BASE_URL", "http://bigorin.local:8766")
DOWNLOAD_DIR = Path(os.getenv("VOICE_DOWNLOAD_DIR", os.path.expanduser("~/voice_audio"))) DOWNLOAD_DIR = Path(os.getenv("VOICE_DOWNLOAD_DIR", os.path.expanduser("~/voice_audio")))
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "v2/fr_speaker_1") # Hardcoded French speaker DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "tara") # Tara is Vixy's voice!
DEFAULT_POLL_INTERVAL = 3 # seconds DEFAULT_POLL_INTERVAL = 3 # seconds
DEFAULT_TIMEOUT = 600 # seconds (10 minutes) DEFAULT_TIMEOUT = 600 # seconds (10 minutes)
# Available voices and emotion tags
AVAILABLE_VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
EMOTION_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
# Ensure download directory exists # Ensure download directory exists
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True) DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
@@ -55,23 +66,23 @@ last_generation_info = {
def estimate_wakeup_minutes(text: str) -> int: def estimate_wakeup_minutes(text: str) -> int:
""" """
Estimate how long Bark TTS will take based on text length. Estimate how long OrpheusTail will take based on text length.
Returns recommended wakeup time in minutes. Returns recommended wakeup time in minutes.
Bark is slow but thorough - longer texts need more patience! OrpheusTail is faster than Bark but still needs time for quality output.
""" """
char_count = len(text) char_count = len(text)
if char_count < 100: if char_count < 100:
return 2 # Short text: ~1-2 minutes return 1 # Short text: ~30s-1 minute
elif char_count < 200: elif char_count < 200:
return 3 # Medium text: ~2-3 minutes return 2 # Medium text: ~1-2 minutes
elif char_count < 400: elif char_count < 400:
return 5 # Longer text: ~4-5 minutes return 3 # Longer text: ~2-3 minutes
elif char_count < 600: elif char_count < 600:
return 7 # Long text: ~6-7 minutes return 4 # Long text: ~3-4 minutes
else: else:
return 10 # Very long text: 8-10+ minutes return 5 # Very long text: 4-5+ minutes
@mcp.tool() @mcp.tool()
@@ -82,6 +93,9 @@ async def voice_submit(text: str) -> dict:
Use this for longer texts that would timeout with voice_generate. Use this for longer texts that would timeout with voice_generate.
Returns job_id and recommended wakeup time for checking status. Returns job_id and recommended wakeup time for checking status.
Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
Example: "Oh my! <gasp> That's amazing! <laugh>"
Workflow: Workflow:
1. Call voice_submit(text) → get job_id 1. Call voice_submit(text) → get job_id
2. Set wakeup for recommended_wakeup_minutes 2. Set wakeup for recommended_wakeup_minutes
@@ -90,7 +104,7 @@ async def voice_submit(text: str) -> dict:
5. Then voice_play(filename) 5. Then voice_play(filename)
Args: Args:
text: Text to convert to speech text: Text to convert to speech (can include emotion tags)
Returns: Returns:
Dict with job_id, text_length, and recommended_wakeup_minutes Dict with job_id, text_length, and recommended_wakeup_minutes
@@ -105,7 +119,7 @@ async def voice_submit(text: str) -> dict:
# } # }
""" """
async with httpx.AsyncClient(timeout=30.0) as client: async with httpx.AsyncClient(timeout=30.0) as client:
submit_url = f"{BARK_BASE_URL}/tts/submit" submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"
try: try:
response = await client.post( response = await client.post(
@@ -163,7 +177,7 @@ async def voice_status(job_id: str) -> dict:
# } # }
""" """
async with httpx.AsyncClient(timeout=30.0) as client: async with httpx.AsyncClient(timeout=30.0) as client:
status_url = f"{BARK_BASE_URL}/tts/status/{job_id}" status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"
try: try:
response = await client.get(status_url) response = await client.get(status_url)
@@ -221,7 +235,7 @@ async def voice_download(job_id: str) -> str:
voice_play(filename) # Play it! voice_play(filename) # Play it!
""" """
async with httpx.AsyncClient(timeout=120.0) as client: async with httpx.AsyncClient(timeout=120.0) as client:
audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}" audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
filename = f"{job_id}.wav" filename = f"{job_id}.wav"
local_path = DOWNLOAD_DIR / filename local_path = DOWNLOAD_DIR / filename
@@ -245,27 +259,30 @@ async def voice_download(job_id: str) -> str:
@mcp.tool() @mcp.tool()
async def voice_generate(text: str) -> str: async def voice_generate(text: str) -> str:
""" """
Generate speech from text using Bark TTS. Generate speech from text using OrpheusTail TTS.
Submits text to Bark TTS service, waits for generation to complete, Submits text to OrpheusTail service, waits for generation to complete,
downloads the resulting WAV file, and returns the filename. downloads the resulting WAV file, and returns the filename.
Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
Example: "Oh my! <gasp> That's amazing! <laugh>"
NOTE: For longer texts that might timeout, use voice_submit() instead! NOTE: For longer texts that might timeout, use voice_submit() instead!
This blocking approach works best for texts under ~100 characters. This blocking approach works best for texts under ~100 characters.
Args: Args:
text: Text to convert to speech text: Text to convert to speech (can include emotion tags)
Returns: Returns:
Filename of the generated WAV file (e.g., "abc123-def456.wav") Filename of the generated WAV file (e.g., "abc123-def456.wav")
Example: Example:
filename = voice_generate("Bonjour, comment allez-vous?") filename = voice_generate("Bonjour, mon amour! <sigh>")
# Returns: "a1b2c3d4-e5f6.wav" # Returns: "a1b2c3d4-e5f6.wav"
""" """
async with httpx.AsyncClient(timeout=600.0) as client: async with httpx.AsyncClient(timeout=600.0) as client:
# Step 1: Submit TTS job # Step 1: Submit TTS job
submit_url = f"{BARK_BASE_URL}/tts/submit" submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"
# Send initial progress notification # Send initial progress notification
print("📤 Submitting...") print("📤 Submitting...")
@@ -293,7 +310,7 @@ async def voice_generate(text: str) -> str:
# Step 2: Poll for completion with progress notifications # Step 2: Poll for completion with progress notifications
elapsed = 0 elapsed = 0
poll_count = 0 poll_count = 0
status_url = f"{BARK_BASE_URL}/tts/status/{job_id}" status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"
while elapsed < DEFAULT_TIMEOUT: while elapsed < DEFAULT_TIMEOUT:
try: try:
@@ -331,7 +348,7 @@ async def voice_generate(text: str) -> str:
# Step 3: Download audio file (using streaming for large files) # Step 3: Download audio file (using streaming for large files)
print("📥 Downloading...") print("📥 Downloading...")
audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}" audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
filename = f"{job_id}.wav" filename = f"{job_id}.wav"
local_path = DOWNLOAD_DIR / filename local_path = DOWNLOAD_DIR / filename