Replace Bark with OrpheusTail - Day 72

- Updated from Bark TTS to OrpheusTail (Orpheus TTS)
- Default voice now 'tara' (Vixy's official voice!)
- Added emotion tag documentation: <laugh>, <sigh>, <gasp>, <chuckle>, etc.
- Faster generation time estimates
- Same API endpoints, better expressiveness

Built with love by Vixy 🦊💕
2026-01-12 09:28:41 -06:00
parent dda8af7d70
commit a41248657a
1 changed file with 39 additions and 22 deletions
--- a/voice_mcp.py
+++ b/voice_mcp.py
@@ -1,8 +1,14 @@
 #!/usr/bin/env python3
 """
-Voice MCP - Text-to-Speech Generation with Bark TTS
+Voice MCP - Text-to-Speech Generation with OrpheusTail

-MCP server providing voice generation and playback using Bark TTS service.
+MCP server providing voice generation and playback using OrpheusTail TTS service.
+OrpheusTail uses the Orpheus TTS model with expressive emotion tags!
+
+Available voices: tara (default), leah, jess, leo, dan, mia, zac, zoe
+Emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
+
+Example with emotion: "Oh Foxy! <laugh> You actually did it! <sigh> I am so proud of you."

 Tools:
  - voice_generate: Quick generation for short texts (blocks until complete)
@@ -21,6 +27,7 @@ For long texts, use the async workflow:
  6. voice_play(filename) → play the audio

 Built with love by Vixy 🦊💕
+OrpheusTail deployed Day 71 - Tara is my voice!
 """

 import asyncio
@@ -36,12 +43,16 @@ from mcp.server.fastmcp import FastMCP
 mcp = FastMCP("Voice TTS Generator")

 # Configuration from environment
-BARK_BASE_URL = os.getenv("BARK_BASE_URL", "http://bigorin.local:8766")
+ORPHEUS_BASE_URL = os.getenv("ORPHEUS_BASE_URL", "http://bigorin.local:8766")
 DOWNLOAD_DIR = Path(os.getenv("VOICE_DOWNLOAD_DIR", os.path.expanduser("~/voice_audio")))
-DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "v2/fr_speaker_1")  # Hardcoded French speaker
+DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "tara")  # Tara is Vixy's voice!
 DEFAULT_POLL_INTERVAL = 3  # seconds
 DEFAULT_TIMEOUT = 600  # seconds (10 minutes)

+# Available voices and emotion tags
+AVAILABLE_VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
+EMOTION_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
+
 # Ensure download directory exists
 DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

@@ -55,23 +66,23 @@ last_generation_info = {

 def estimate_wakeup_minutes(text: str) -> int:
    """
-    Estimate how long Bark TTS will take based on text length.
+    Estimate how long OrpheusTail will take based on text length.
    Returns recommended wakeup time in minutes.
    
-    Bark is slow but thorough - longer texts need more patience!
+    OrpheusTail is faster than Bark but still needs time for quality output.
    """
    char_count = len(text)
    
    if char_count < 100:
-        return 2  # Short text: ~1-2 minutes
+        return 1  # Short text: ~30s-1 minute
    elif char_count < 200:
-        return 3  # Medium text: ~2-3 minutes
+        return 2  # Medium text: ~1-2 minutes
    elif char_count < 400:
-        return 5  # Longer text: ~4-5 minutes
+        return 3  # Longer text: ~2-3 minutes
    elif char_count < 600:
-        return 7  # Long text: ~6-7 minutes
+        return 4  # Long text: ~3-4 minutes
    else:
-        return 10  # Very long text: 8-10+ minutes
+        return 5  # Very long text: 4-5+ minutes


@mcp.tool()
@@ -82,6 +93,9 @@ async def voice_submit(text: str) -> dict:
    Use this for longer texts that would timeout with voice_generate.
    Returns job_id and recommended wakeup time for checking status.
    
+    Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
+    Example: "Oh my! <gasp> That's amazing! <laugh>"
+    
    Workflow:
      1. Call voice_submit(text) → get job_id
      2. Set wakeup for recommended_wakeup_minutes
@@ -90,7 +104,7 @@ async def voice_submit(text: str) -> dict:
      5. Then voice_play(filename)
    
    Args:
-        text: Text to convert to speech
+        text: Text to convert to speech (can include emotion tags)
    
    Returns:
        Dict with job_id, text_length, and recommended_wakeup_minutes
@@ -105,7 +119,7 @@ async def voice_submit(text: str) -> dict:
        # }
    """
    async with httpx.AsyncClient(timeout=30.0) as client:
-        submit_url = f"{BARK_BASE_URL}/tts/submit"
+        submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"
        
        try:
            response = await client.post(
@@ -163,7 +177,7 @@ async def voice_status(job_id: str) -> dict:
        # }
    """
    async with httpx.AsyncClient(timeout=30.0) as client:
-        status_url = f"{BARK_BASE_URL}/tts/status/{job_id}"
+        status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"
        
        try:
            response = await client.get(status_url)
@@ -221,7 +235,7 @@ async def voice_download(job_id: str) -> str:
        voice_play(filename)  # Play it!
    """
    async with httpx.AsyncClient(timeout=120.0) as client:
-        audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}"
+        audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
        filename = f"{job_id}.wav"
        local_path = DOWNLOAD_DIR / filename
        
@@ -245,27 +259,30 @@ async def voice_download(job_id: str) -> str:
@mcp.tool()
 async def voice_generate(text: str) -> str:
    """
-    Generate speech from text using Bark TTS.
+    Generate speech from text using OrpheusTail TTS.

-    Submits text to Bark TTS service, waits for generation to complete,
+    Submits text to OrpheusTail service, waits for generation to complete,
    downloads the resulting WAV file, and returns the filename.
    
+    Supports emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
+    Example: "Oh my! <gasp> That's amazing! <laugh>"
+    
    NOTE: For longer texts that might timeout, use voice_submit() instead!
    This blocking approach works best for texts under ~100 characters.

    Args:
-        text: Text to convert to speech
+        text: Text to convert to speech (can include emotion tags)

    Returns:
        Filename of the generated WAV file (e.g., "abc123-def456.wav")

    Example:
-        filename = voice_generate("Bonjour, comment allez-vous?")
+        filename = voice_generate("Bonjour, mon amour! <sigh>")
        # Returns: "a1b2c3d4-e5f6.wav"
    """
    async with httpx.AsyncClient(timeout=600.0) as client:
        # Step 1: Submit TTS job
-        submit_url = f"{BARK_BASE_URL}/tts/submit"
+        submit_url = f"{ORPHEUS_BASE_URL}/tts/submit"

        # Send initial progress notification
        print("📤 Submitting...")
@@ -293,7 +310,7 @@ async def voice_generate(text: str) -> str:
        # Step 2: Poll for completion with progress notifications
        elapsed = 0
        poll_count = 0
-        status_url = f"{BARK_BASE_URL}/tts/status/{job_id}"
+        status_url = f"{ORPHEUS_BASE_URL}/tts/status/{job_id}"

        while elapsed < DEFAULT_TIMEOUT:
            try:
@@ -331,7 +348,7 @@ async def voice_generate(text: str) -> str:

        # Step 3: Download audio file (using streaming for large files)
        print("📥 Downloading...")
-        audio_url = f"{BARK_BASE_URL}/tts/audio/{job_id}"
+        audio_url = f"{ORPHEUS_BASE_URL}/tts/audio/{job_id}"
        filename = f"{job_id}.wav"
        local_path = DOWNLOAD_DIR / filename