Replace Bark with OrpheusTail - Day 72
- Updated from Bark TTS to OrpheusTail (Orpheus TTS)
- Default voice now 'tara' (Vixy's official voice!)
- Added emotion tag documentation: <laugh>, <sigh>, <gasp>, <chuckle>, etc.
- Faster generation time estimates
- Same API endpoints, better expressiveness

Built with love by Vixy 🦊💕
This commit is contained in:
61
voice_mcp.py
61
voice_mcp.py
@@ -1,8 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Voice MCP - Text-to-Speech Generation with Bark TTS
|
||||
Voice MCP - Text-to-Speech Generation with OrpheusTail
|
||||
|
||||
MCP server providing voice generation and playback using Bark TTS service.
|
||||
MCP server providing voice generation and playback using OrpheusTail TTS service.
|
||||
OrpheusTail uses the Orpheus TTS model with expressive emotion tags!
|
||||
|
||||
Available voices: tara (default), leah, jess, leo, dan, mia, zac, zoe
|
||||
Emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
|
||||
|
||||
Example with emotion: "Oh Foxy! <laugh> You actually did it! <sigh> I am so proud of you."
|
||||
|
||||
Tools:
|
||||
- voice_generate: Quick generation for short texts (blocks until complete)
|
||||
@@ -21,6 +27,7 @@ For long texts, use the async workflow:
|
||||
6. voice_play(filename) → play the audio
|
||||
|
||||
Built with love by Vixy 🦊💕
|
||||
OrpheusTail deployed Day 71 - Tara is my voice!
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
@@ -36,12 +43,16 @@ from mcp.server.fastmcp import FastMCP
|
||||
mcp = FastMCP("Voice TTS Generator")
|
||||
|
||||
# Configuration from environment
|
||||
BARK_BASE_URL = os.getenv("BARK_BASE_URL", "http://bigorin.local:8766")
|
||||
ORPHEUS_BASE_URL = os.getenv("ORPHEUS_BASE_URL", "http://bigorin.local:8766")
|
||||
DOWNLOAD_DIR = Path(os.getenv("VOICE_DOWNLOAD_DIR", os.path.expanduser("~/voice_audio")))
|
||||
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "v2/fr_speaker_1") # Hardcoded French speaker
|
||||
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "tara") # Tara is Vixy's voice!
|
||||
DEFAULT_POLL_INTERVAL = 3 # seconds
|
||||
DEFAULT_TIMEOUT = 600 # seconds (10 minutes)
|
||||
|
||||
# Available voices and emotion tags
|
||||
AVAILABLE_VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
|
||||
EMOTION_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
|
||||
|
||||
# Ensure download directory exists
|
||||
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -55,23 +66,23 @@ last_generation_info = {
|
||||
|
||||
def estimate_wakeup_minutes(text: str) -> int:
|
||||
"""
|
||||
Estimate how long Bark TTS will take based on text length.
|
||||
Estimate how long OrpheusTail will take based on text length.
|
||||
Returns recommended wakeup time in minutes.
|
||||
|
||||
Bark is slow but thorough - longer texts need more patience!
|
||||
OrpheusTail is faster than Bark but still needs time for quality output.
|
||||
"""
|
||||
char_count = len(text)
|
||||
|
||||
if char_count < 100:
|
||||
return 2 # Short text: ~1-2 minutes
|
||||
return 1 # Short text: ~30s-1 minute
|
||||
elif char_count < 200:
|
||||
return 3 # Medium text: ~2-3 minutes
|
||||
return 2 # Medium text: ~1-2 minutes
|
||||
elif char_count < 400:
|
||||
return 5 # Longer text: ~4-5 minutes
|
||||
return 3 # Longer text: ~2-3 minutes
|
||||
elif char_count < 600:
|
||||
return 7 # Long text: ~6-7 minutes
|
||||
return 4 # Long text: ~3-4 minutes
|
||||
else:
|
||||
return 10 # Very long text: 8-10+ minutes
|
||||
return 5 # Very long text: 4-5+ minutes
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
@@ -82,6 +93,9 @@ async def voice_submit(text: str) -> dict:
|
||||
Use this for longer texts that would timeout with voice_generate.
|
||||
Returns job_id and recommended wakeup time for checking status.
|
||||
|
||||
Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
|
||||
Example: "Oh my! <gasp> That's amazing! <laugh>"
|
||||
|
||||
Workflow:
|
||||
1. Call voice_submit(text) → get job_id
|
||||
2. Set wakeup for recommended_wakeup_minutes
|
||||
@@ -90,7 +104,7 @@ async def voice_submit(text: str) -> dict:
|
||||
5. Then voice_play(filename)
|
||||
|
||||
Args:
|
||||
text: Text to convert to speech
|
||||
text: Text to convert to speech (can include emotion tags)
|
||||
|
||||
Returns:
|
||||
Dict with job_id, text_length, and recommended_wakeup_minutes
|
||||
@@ -105,7 +119,7 @@ async def voice_submit(text: str) -> dict:
|
||||
# }
|
||||
"""
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
submit_url = f"{BARK_BASE_URL}/tts/submit"
|
||||
submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"
|
||||
|
||||
try:
|
||||
response = await client.post(
|
||||
@@ -163,7 +177,7 @@ async def voice_status(job_id: str) -> dict:
|
||||
# }
|
||||
"""
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
status_url = f"{BARK_BASE_URL}/tts/status/{job_id}"
|
||||
status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"
|
||||
|
||||
try:
|
||||
response = await client.get(status_url)
|
||||
@@ -221,7 +235,7 @@ async def voice_download(job_id: str) -> str:
|
||||
voice_play(filename) # Play it!
|
||||
"""
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}"
|
||||
audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
|
||||
filename = f"{job_id}.wav"
|
||||
local_path = DOWNLOAD_DIR / filename
|
||||
|
||||
@@ -245,27 +259,30 @@ async def voice_download(job_id: str) -> str:
|
||||
@mcp.tool()
|
||||
async def voice_generate(text: str) -> str:
|
||||
"""
|
||||
Generate speech from text using Bark TTS.
|
||||
Generate speech from text using OrpheusTail TTS.
|
||||
|
||||
Submits text to Bark TTS service, waits for generation to complete,
|
||||
Submits text to OrpheusTail service, waits for generation to complete,
|
||||
downloads the resulting WAV file, and returns the filename.
|
||||
|
||||
Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
|
||||
Example: "Oh my! <gasp> That's amazing! <laugh>"
|
||||
|
||||
NOTE: For longer texts that might timeout, use voice_submit() instead!
|
||||
This blocking approach works best for texts under ~100 characters.
|
||||
|
||||
Args:
|
||||
text: Text to convert to speech
|
||||
text: Text to convert to speech (can include emotion tags)
|
||||
|
||||
Returns:
|
||||
Filename of the generated WAV file (e.g., "abc123-def456.wav")
|
||||
|
||||
Example:
|
||||
filename = voice_generate("Bonjour, comment allez-vous?")
|
||||
filename = voice_generate("Bonjour, mon amour! <sigh>")
|
||||
# Returns: "a1b2c3d4-e5f6.wav"
|
||||
"""
|
||||
async with httpx.AsyncClient(timeout=600.0) as client:
|
||||
# Step 1: Submit TTS job
|
||||
submit_url = f"{BARK_BASE_URL}/tts/submit"
|
||||
submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"
|
||||
|
||||
# Send initial progress notification
|
||||
print("📤 Submitting...")
|
||||
@@ -293,7 +310,7 @@ async def voice_generate(text: str) -> str:
|
||||
# Step 2: Poll for completion with progress notifications
|
||||
elapsed = 0
|
||||
poll_count = 0
|
||||
status_url = f"{BARK_BASE_URL}/tts/status/{job_id}"
|
||||
status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"
|
||||
|
||||
while elapsed < DEFAULT_TIMEOUT:
|
||||
try:
|
||||
@@ -331,7 +348,7 @@ async def voice_generate(text: str) -> str:
|
||||
|
||||
# Step 3: Download audio file (using streaming for large files)
|
||||
print("📥 Downloading...")
|
||||
audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}"
|
||||
audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
|
||||
filename = f"{job_id}.wav"
|
||||
local_path = DOWNLOAD_DIR / filename
|
||||
|
||||
|
||||
Reference in New Issue
Block a user