Replaced sync vLLM LLM with AsyncLLMEngine for real streaming. Tokens now flow incrementally: vLLM → async_generate_tokens → orpheus_tts tokens_decoder → audio chunks → StreamingResponse. First audio chunk arrives after ~28 tokens (SNAC codec warmup) instead of waiting for all ~2000+ tokens to complete. Expected: first-byte latency drops from ~15s to ~1-2s. Background jobs (submit/async) still work via sync wrapper that collects all tokens from the async engine. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
742 lines
23 KiB
Python
742 lines
23 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
OrpheusTail - Orpheus TTS Service
|
|
|
|
FastAPI server for Orpheus text-to-speech generation on Jetson AGX Orin.
|
|
Replaces VoiceTail (Bark) with better control, voice cloning, and emotion tags.
|
|
|
|
Key Features:
|
|
- Emotion tags: <laugh>, <chuckle>, <sigh>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>
|
|
- Zero-shot voice cloning from reference audio
|
|
- Streaming support for real-time head playback
|
|
- Built-in voices: tara, leah, jess, leo, dan, mia, zac, zoe
|
|
|
|
Endpoints:
|
|
- POST /tts/submit - Submit TTS job (returns job_id)
|
|
- GET /tts/status/{job_id} - Check job status
|
|
- GET /tts/audio/{job_id} - Download generated audio
|
|
- POST /tts/stream - Stream audio in real-time (for head)
|
|
- POST /voice/clone - Upload reference audio for voice cloning
|
|
- GET /voices - List available voices
|
|
- GET /health - Health check
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
import hashlib
|
|
import asyncio
|
|
import uuid
|
|
import wave
|
|
import io
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
from dataclasses import dataclass, asdict
|
|
from enum import Enum
|
|
|
|
from fastapi import FastAPI, BackgroundTasks, HTTPException, UploadFile, File
|
|
from fastapi.responses import FileResponse, StreamingResponse
|
|
from pydantic import BaseModel
|
|
|
|
# Configuration from environment
ORPHEUS_MODEL = os.getenv("ORPHEUS_MODEL", "canopylabs/orpheus-tts-0.1-finetune-prod")  # HuggingFace model id
CACHE_ENABLED = os.getenv("CACHE_ENABLED", "true").lower() == "true"  # reuse audio for repeated text+voice pairs
CACHE_DIR = Path(os.getenv("CACHE_DIR", "cache"))  # cached WAVs, keyed by sha256(text|voice)
OUTPUT_DIR = Path(os.getenv("OUTPUT_DIR", "output"))  # per-job WAV output + jobs.json
VOICES_DIR = Path(os.getenv("VOICES_DIR", "voices"))  # For cloned voice references
RETENTION_DAYS = int(os.getenv("RETENTION_DAYS", "10"))  # job/audio age before cleanup deletes them
CLEANUP_INTERVAL_HOURS = int(os.getenv("CLEANUP_INTERVAL_HOURS", "1"))  # cleanup sweep period
DEFAULT_VOICE = os.getenv("DEFAULT_VOICE", "tara")  # Orpheus default voice
MAX_MODEL_LEN = int(os.getenv("MAX_MODEL_LEN", "8192"))  # vLLM max context length
MAX_TOKENS = int(os.getenv("MAX_TOKENS", "8000"))  # generation cap per request
SAMPLE_RATE = 24000  # Hz; used for WAV headers written by generate_speech_sync

# Ensure directories exist
CACHE_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)
VOICES_DIR.mkdir(exist_ok=True)

# Jobs persistence
JOBS_FILE = OUTPUT_DIR / "jobs.json"

# Built-in Orpheus voices (in order of conversational realism per docs)
BUILTIN_VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]

# Supported emotion tags
EMOTION_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]

# Initialize FastAPI
app = FastAPI(
    title="OrpheusTail - Orpheus TTS Service",
    description="Text-to-speech with emotion control and voice cloning for Vixy",
    version="1.0.0"
)

# Global model (loaded at startup)
# Holds the OrpheusModel instance after startup(); None until then.
model = None
|
|
|
|
|
|
class JobStatus(str, Enum):
    """Job status enum"""
    # str mixin makes values JSON-serializable directly in API responses.
    PENDING = "PENDING"        # accepted, generation not yet started
    PROCESSING = "PROCESSING"  # generation in progress
    SUCCESS = "SUCCESS"        # audio ready for download
    FAILURE = "FAILURE"        # generation failed; see JobInfo.error
|
|
|
|
|
|
@dataclass
class JobInfo:
    """Job information"""
    job_id: str                         # UUID4 string assigned at submission
    text: str                           # input text (may contain emotion tags)
    voice: str                          # requested voice name
    status: JobStatus                   # current lifecycle state
    progress: int = 0                   # coarse percentage: 0/25/50/75/100
    audio_path: Optional[str] = None    # path to generated WAV once done
    error: Optional[str] = None         # failure message when status is FAILURE
    cached: bool = False                # True when audio came from the cache
    created_at: str = ""                # ISO-8601 submission timestamp
    completed_at: Optional[str] = None  # ISO-8601 completion timestamp
|
|
|
|
|
|
# In-memory job storage
# Keyed by job_id; mirrored to JOBS_FILE by save_jobs_to_disk().
jobs: Dict[str, JobInfo] = {}
|
|
|
|
|
|
def load_jobs_from_disk():
    """Restore the in-memory job table from JOBS_FILE, if one exists.

    Best-effort: a missing file is a no-op and a corrupt file is logged
    and skipped rather than aborting startup.
    """
    global jobs
    if not JOBS_FILE.exists():
        return
    try:
        with open(JOBS_FILE, 'r') as f:
            raw = json.load(f)
        for key, payload in raw.items():
            jobs[key] = JobInfo(**payload)
        print(f"Loaded {len(jobs)} jobs from disk")
    except Exception as e:
        print(f"Error loading jobs: {e}")
|
|
|
|
|
|
def save_jobs_to_disk():
    """Persist the in-memory job table to JOBS_FILE as pretty-printed JSON.

    Best-effort: serialization/IO failures are logged, never raised.
    """
    try:
        serializable = {key: asdict(info) for key, info in jobs.items()}
        with open(JOBS_FILE, 'w') as f:
            json.dump(serializable, f, indent=2)
    except Exception as e:
        print(f"Error saving jobs: {e}")
|
|
|
|
|
|
def hash_text_voice(text: str, voice: str) -> str:
    """Deterministic cache key for a (text, voice) pair: SHA-256 hex digest."""
    return hashlib.sha256(f"{text}|{voice}".encode()).hexdigest()
|
|
|
|
|
|
def get_from_cache(cache_key: str) -> Optional[str]:
    """Return the cached WAV path for cache_key, or None on miss or when
    caching is disabled."""
    if not CACHE_ENABLED:
        return None
    candidate = CACHE_DIR / f"{cache_key}.wav"
    if not candidate.exists():
        return None
    print(f"Cache hit: {cache_key}")
    return str(candidate)
|
|
|
|
|
|
def save_to_cache(cache_key: str, audio_path: str):
    """Copy a generated WAV into the cache directory under cache_key.

    No-op when caching is disabled; copy failures are logged, not raised.
    """
    if not CACHE_ENABLED:
        return
    try:
        import shutil
        destination = CACHE_DIR / f"{cache_key}.wav"
        shutil.copy(audio_path, destination)
        print(f"Saved to cache: {cache_key}")
    except Exception as e:
        print(f"Error saving to cache: {e}")
|
|
|
|
|
|
def get_custom_voices() -> List[str]:
    """Names of custom (cloned) voices: one per *.wav file in VOICES_DIR."""
    return [wav.stem for wav in VOICES_DIR.glob("*.wav")]
|
|
|
|
|
|
def chunk_text(text: str, max_chars: int = 800) -> List[str]:
    """
    Split text into chunks at sentence boundaries for sequential TTS generation.

    No chunk exceeds max_chars unless a single sentence is itself longer.
    Emotion tags embedded within a sentence stay with that sentence.
    """
    # Sentence boundary = terminal punctuation followed by whitespace.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    chunks: List[str] = []
    pending: List[str] = []
    pending_len = 0

    for sentence in sentences:
        needed = len(sentence)
        # Flush the accumulating chunk when this sentence would overflow it
        # (the +1 accounts for the joining space).
        if pending and pending_len + 1 + needed > max_chars:
            chunks.append(' '.join(pending))
            pending = []
            pending_len = 0
        pending.append(sentence)
        pending_len += needed + (1 if pending_len > 0 else 0)

    # Flush whatever is left over.
    if pending:
        chunks.append(' '.join(pending))

    return chunks if chunks else [text]
|
|
|
|
|
|
def generate_speech_sync(text: str, voice: str) -> bytes:
    """
    Generate speech using Orpheus model (synchronous).

    Args:
        text: Text to convert (may include emotion tags)
        voice: Voice name (built-in or custom)

    Returns:
        WAV audio bytes: 16-bit mono PCM at SAMPLE_RATE.

    Raises:
        ValueError: if the model produced no audio frames at all.
    """
    global model

    # Check if it's a custom voice (needs reference audio). Cloning is not
    # implemented yet, so a reference file currently selects the default voice.
    custom_voice_path = VOICES_DIR / f"{voice}.wav"

    if custom_voice_path.exists():
        print(f"Custom voice '{voice}' - voice cloning to be implemented")
        voice = DEFAULT_VOICE
    elif voice not in BUILTIN_VOICES:
        print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
        voice = DEFAULT_VOICE

    # Split long text into chunks for sequential generation
    text_chunks = chunk_text(text)
    print(f"Generating: {text} ({len(text_chunks)} chunk(s))")

    all_pcm = []

    for chunk_idx, chunk in enumerate(text_chunks):
        print(f"Chunk {chunk_idx + 1}/{len(text_chunks)}: {chunk[:80]}...")

        syn_tokens = model.generate_speech(
            prompt=chunk,
            voice=voice,
            max_tokens=MAX_TOKENS,
        )

        # Drain the (possibly streaming) generator into raw PCM frames.
        # (Previously a manual append loop with an unused enumerate index;
        # also dropped an unused local numpy import.)
        audio_chunks = list(syn_tokens)

        print(f" -> {len(audio_chunks)} audio frames")

        if audio_chunks:
            all_pcm.append(b''.join(audio_chunks))

    if not all_pcm:
        raise ValueError("No audio chunks generated")

    # Concatenate raw PCM from all chunks, wrap in single WAV
    audio_bytes_raw = b''.join(all_pcm)

    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(audio_bytes_raw)

    print(f"Generated WAV: {len(buffer.getvalue())} bytes")
    return buffer.getvalue()
|
|
|
|
|
|
def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str:
    """Write audio bytes to OUTPUT_DIR/<job_id>.wav and return the path."""
    destination = OUTPUT_DIR / f"{job_id}.wav"
    destination.write_bytes(audio_bytes)
    return str(destination)
|
|
|
|
|
|
async def generate_speech_background(job_id: str, text: str, voice: str):
    """Background task for speech generation (async).

    Runs the blocking generate_speech_sync in a thread-pool executor so the
    event loop (and every other request) stays responsive while vLLM works;
    the previous version called it directly and stalled the loop for the
    whole generation. Updates job status/progress in `jobs` and persists
    after each transition; failures are recorded on the job, never raised.
    """
    try:
        jobs[job_id].status = JobStatus.PROCESSING
        jobs[job_id].progress = 25
        save_jobs_to_disk()

        # Check cache first
        cache_key = hash_text_voice(text, voice)
        cached_path = get_from_cache(cache_key)

        if cached_path:
            jobs[job_id].audio_path = cached_path
            jobs[job_id].status = JobStatus.SUCCESS
            jobs[job_id].progress = 100
            jobs[job_id].cached = True
            jobs[job_id].completed_at = datetime.now().isoformat()
            save_jobs_to_disk()
            print(f"Job {job_id} completed from cache")
            return

        # Generate audio off the event loop — generate_speech_sync blocks for
        # seconds to minutes, which would otherwise freeze all other requests.
        jobs[job_id].progress = 50
        save_jobs_to_disk()

        print(f"Generating audio for job {job_id}...")
        loop = asyncio.get_running_loop()
        audio_bytes = await loop.run_in_executor(None, generate_speech_sync, text, voice)

        # Save to file
        jobs[job_id].progress = 75
        save_jobs_to_disk()

        output_path = save_audio_to_file(job_id, audio_bytes)

        # Save to cache
        save_to_cache(cache_key, output_path)

        # Complete
        jobs[job_id].audio_path = output_path
        jobs[job_id].status = JobStatus.SUCCESS
        jobs[job_id].progress = 100
        jobs[job_id].completed_at = datetime.now().isoformat()
        save_jobs_to_disk()

        print(f"Job {job_id} completed successfully")

    except Exception as e:
        print(f"Job {job_id} failed: {e}")
        import traceback
        traceback.print_exc()
        jobs[job_id].status = JobStatus.FAILURE
        jobs[job_id].error = str(e)
        save_jobs_to_disk()
|
|
|
|
|
|
async def cleanup_old_jobs():
    """Background task to cleanup old jobs and files.

    Sleeps CLEANUP_INTERVAL_HOURS between sweeps and runs forever. Each
    sweep deletes jobs (and their audio files) older than RETENTION_DAYS.
    A failure in one sweep is logged and the loop continues.
    """
    while True:
        try:
            await asyncio.sleep(CLEANUP_INTERVAL_HOURS * 3600)
            cutoff = datetime.now() - timedelta(days=RETENTION_DAYS)

            to_delete = []
            for job_id, job in jobs.items():
                try:
                    created = datetime.fromisoformat(job.created_at)
                    if created < cutoff:
                        if job.audio_path and Path(job.audio_path).exists():
                            Path(job.audio_path).unlink()
                        to_delete.append(job_id)
                # Narrowed from a bare `except:` — only swallow the expected
                # failures (bad/missing timestamp, file removal error) and
                # keep the job for the next sweep instead of hiding bugs.
                except (TypeError, ValueError, OSError):
                    pass

            # Delete outside the iteration to avoid mutating jobs mid-loop.
            for job_id in to_delete:
                del jobs[job_id]

            if to_delete:
                save_jobs_to_disk()
                print(f"Cleanup: deleted {len(to_delete)} old jobs")

        except Exception as e:
            print(f"Error in cleanup task: {e}")
|
|
|
|
|
|
@app.on_event("startup")
async def startup():
    """Load model and jobs on startup.

    Monkey-patches OrpheusModel so that:
      * its engine is a vLLM AsyncLLMEngine (enables token-level streaming);
      * a sync wrapper (generate_tokens_sync) still serves background jobs.

    Fix: `async_generate_tokens` was previously a *local* of this function,
    but /tts/stream references it as a module-level name — every streaming
    request would have raised NameError. It is now published at module scope
    via the `global` declaration below.
    """
    global model, async_generate_tokens

    print("=" * 60)
    print("OrpheusTail - Orpheus TTS Service Starting")
    print(f"Model: {ORPHEUS_MODEL}")
    print(f"Max Model Len: {MAX_MODEL_LEN}")
    print(f"Max Tokens: {MAX_TOKENS}")
    print(f"Cache: {'Enabled' if CACHE_ENABLED else 'Disabled'}")
    print(f"Default Voice: {DEFAULT_VOICE}")
    print("=" * 60)

    # Import and load Orpheus model
    print("Loading Orpheus model (this may take a moment)...")
    from orpheus_tts import OrpheusModel
    from vllm import AsyncLLMEngine
    from vllm.engine.arg_utils import AsyncEngineArgs

    # Monkey-patch OrpheusModel to use AsyncLLMEngine for true streaming.
    # (Removed the unused capture of the original _setup_engine.)
    def patched_setup_engine(self):
        # Mirrors OrpheusModel._setup_engine but builds an async engine.
        model_name = self._map_model_params(self.model_name)
        engine_args = AsyncEngineArgs(
            model=model_name,
            max_model_len=MAX_MODEL_LEN,
            gpu_memory_utilization=0.85,
            enforce_eager=False,
        )
        return AsyncLLMEngine.from_engine_args(engine_args)

    OrpheusModel._setup_engine = patched_setup_engine

    # Sync token generation (for background jobs).
    # Uses the async engine but collects all results synchronously.
    # Note: default stop_token_ids is a tuple (immutable-default best practice);
    # callers may still pass a list.
    def patched_generate_tokens_sync(self, prompt, voice=None, request_id="req-001",
                                     temperature=0.6, top_p=0.8, max_tokens=MAX_TOKENS,
                                     stop_token_ids=(49158,), repetition_penalty=1.3):
        from vllm import SamplingParams
        prompt_string = self._format_prompt(prompt, voice)
        print(prompt)
        sampling_params = SamplingParams(
            temperature=temperature, top_p=top_p, max_tokens=max_tokens,
            stop_token_ids=list(stop_token_ids), repetition_penalty=repetition_penalty,
        )
        req_id = f"sync-{uuid.uuid4().hex[:8]}"

        async def _collect_all():
            # Drain the async generator; the last output carries the full text.
            final = None
            async for output in self.engine.generate(prompt_string, sampling_params, req_id):
                final = output
            return final

        try:
            asyncio.get_running_loop()
            # A loop is already running in this thread: run the collection on
            # a private loop in a worker thread so we don't deadlock it.
            import concurrent.futures
            with concurrent.futures.ThreadPoolExecutor() as pool:
                final_output = pool.submit(lambda: asyncio.run(_collect_all())).result()
        except RuntimeError:
            # No running loop — safe to use asyncio.run directly.
            final_output = asyncio.run(_collect_all())

        if final_output:
            text = final_output.outputs[0].text
            print(f"Raw output (first 500 chars): {text[:500]}")
            tokens = re.findall(r'<custom_token_\d+>', text)
            print(f"Found {len(tokens)} tokens")
            for token in tokens:
                yield token

    OrpheusModel.generate_tokens_sync = patched_generate_tokens_sync

    # Async streaming token generation (for /tts/stream — yields tokens as
    # produced). Bound at module scope via the `global` declaration above.
    async def async_generate_tokens(model_instance, prompt, voice=None,
                                    temperature=0.6, top_p=0.8, max_tokens=MAX_TOKENS,
                                    stop_token_ids=(49158,), repetition_penalty=1.3):
        """Async generator: yields tokens incrementally as vLLM produces them."""
        from vllm import SamplingParams
        prompt_string = model_instance._format_prompt(prompt, voice)
        print(f"[streaming] {prompt}")
        sampling_params = SamplingParams(
            temperature=temperature, top_p=top_p, max_tokens=max_tokens,
            stop_token_ids=list(stop_token_ids), repetition_penalty=repetition_penalty,
        )
        request_id = f"stream-{uuid.uuid4().hex[:8]}"
        prev_text_len = 0

        async for output in model_instance.engine.generate(prompt_string, sampling_params, request_id):
            # output.outputs[0].text grows incrementally; only scan the tail.
            # Assumes each <custom_token_N> is a single vocab token and so
            # never straddles an incremental boundary — TODO confirm for the
            # deployed tokenizer.
            text = output.outputs[0].text
            new_text = text[prev_text_len:]
            prev_text_len = len(text)
            new_tokens = re.findall(r'<custom_token_\d+>', new_text)
            for token in new_tokens:
                yield token

    model = OrpheusModel(model_name=ORPHEUS_MODEL)

    print("✓ Orpheus model loaded successfully")

    # Load jobs from disk
    load_jobs_from_disk()

    # Start cleanup task; keep a reference on app.state so the task isn't
    # garbage-collected (asyncio.create_task holds only a weak reference).
    app.state.cleanup_task = asyncio.create_task(cleanup_old_jobs())
|
|
|
|
|
|
# === Pydantic Models ===
|
|
|
|
class TTSRequest(BaseModel):
    """TTS job submission request"""
    # Text to synthesize; may embed emotion tags such as <laugh> or <sigh>.
    text: str
    # Built-in or custom voice name; server falls back to the default if unknown.
    voice: str = DEFAULT_VOICE
|
|
|
|
|
|
class TTSStreamRequest(BaseModel):
    """TTS streaming request (for head playback)"""
    # Text to synthesize and stream; may embed emotion tags.
    text: str
    # Voice name; non-built-in names fall back to DEFAULT_VOICE in /tts/stream.
    voice: str = DEFAULT_VOICE
|
|
|
|
|
|
class JobResponse(BaseModel):
    """Job submission response"""
    # UUID4 identifier to poll via /tts/status/{job_id}.
    job_id: str
    # Initial status string (always "PENDING" at submission time).
    status: str
|
|
|
|
|
|
class StatusResponse(BaseModel):
    """Job status response"""
    job_id: str
    # One of the JobStatus values: PENDING/PROCESSING/SUCCESS/FAILURE.
    status: str
    # Coarse percentage (0, 25, 50, 75, 100) as set by the background task.
    progress: int
    # True when the audio was served from the text+voice cache.
    cached: bool = False
    # Populated only when status == SUCCESS: relative download URL.
    audio_url: Optional[str] = None
    # Populated only when status == FAILURE: the failure message.
    error: Optional[str] = None
|
|
|
|
|
|
class VoicesResponse(BaseModel):
    """Available voices response"""
    # Fixed Orpheus roster (see BUILTIN_VOICES).
    builtin: List[str]
    # Cloned voices discovered as *.wav files in VOICES_DIR.
    custom: List[str]
    # Fallback voice used when a request names an unknown voice.
    default: str
    # Emotion tags the model understands inline in text.
    emotion_tags: List[str]
|
|
|
|
|
|
# === Endpoints ===
|
|
|
|
@app.get("/")
|
|
def root():
|
|
"""Root endpoint"""
|
|
return {
|
|
"service": "OrpheusTail - Orpheus TTS Service",
|
|
"version": "1.0.0",
|
|
"model": ORPHEUS_MODEL,
|
|
"default_voice": DEFAULT_VOICE,
|
|
"emotion_tags": EMOTION_TAGS,
|
|
"endpoints": {
|
|
"/tts/submit": "POST - Submit TTS job",
|
|
"/tts/status/{job_id}": "GET - Check job status",
|
|
"/tts/audio/{job_id}": "GET - Download audio",
|
|
"/tts/stream": "POST - Stream audio (for head)",
|
|
"/voice/clone": "POST - Upload voice reference",
|
|
"/voices": "GET - List available voices",
|
|
"/health": "GET - Health check"
|
|
}
|
|
}
|
|
|
|
|
|
@app.get("/health")
|
|
def health():
|
|
"""Health check"""
|
|
return {
|
|
"status": "healthy",
|
|
"model_loaded": model is not None,
|
|
"cache_enabled": CACHE_ENABLED,
|
|
"voices_available": len(BUILTIN_VOICES) + len(get_custom_voices())
|
|
}
|
|
|
|
|
|
@app.get("/voices", response_model=VoicesResponse)
|
|
def list_voices():
|
|
"""List all available voices"""
|
|
return VoicesResponse(
|
|
builtin=BUILTIN_VOICES,
|
|
custom=get_custom_voices(),
|
|
default=DEFAULT_VOICE,
|
|
emotion_tags=EMOTION_TAGS
|
|
)
|
|
|
|
|
|
@app.post("/tts/submit", response_model=JobResponse)
|
|
async def submit_tts_job(request: TTSRequest):
|
|
"""Submit a TTS job for processing."""
|
|
job_id = str(uuid.uuid4())
|
|
|
|
job = JobInfo(
|
|
job_id=job_id,
|
|
text=request.text,
|
|
voice=request.voice,
|
|
status=JobStatus.PENDING,
|
|
progress=0,
|
|
created_at=datetime.now().isoformat()
|
|
)
|
|
|
|
jobs[job_id] = job
|
|
save_jobs_to_disk()
|
|
|
|
# Use asyncio.create_task for proper async execution
|
|
asyncio.create_task(
|
|
generate_speech_background(job_id, request.text, request.voice)
|
|
)
|
|
|
|
print(f"Job {job_id} submitted: '{request.text[:50]}...' with voice '{request.voice}'")
|
|
|
|
return JobResponse(job_id=job_id, status=JobStatus.PENDING)
|
|
|
|
|
|
@app.get("/tts/status/{job_id}", response_model=StatusResponse)
|
|
async def get_job_status(job_id: str):
|
|
"""Get status of a TTS job."""
|
|
if job_id not in jobs:
|
|
raise HTTPException(status_code=404, detail="Job not found")
|
|
|
|
job = jobs[job_id]
|
|
|
|
response = StatusResponse(
|
|
job_id=job_id,
|
|
status=job.status,
|
|
progress=job.progress,
|
|
cached=job.cached
|
|
)
|
|
|
|
if job.status == JobStatus.SUCCESS:
|
|
response.audio_url = f"/tts/audio/{job_id}"
|
|
elif job.status == JobStatus.FAILURE:
|
|
response.error = job.error
|
|
|
|
return response
|
|
|
|
|
|
@app.get("/tts/audio/{job_id}")
|
|
async def get_audio(job_id: str):
|
|
"""Retrieve generated audio file."""
|
|
if job_id not in jobs:
|
|
raise HTTPException(status_code=404, detail="Job not found")
|
|
|
|
job = jobs[job_id]
|
|
|
|
if job.status != JobStatus.SUCCESS:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail=f"Audio not ready. Job status: {job.status}"
|
|
)
|
|
|
|
if not job.audio_path or not Path(job.audio_path).exists():
|
|
raise HTTPException(status_code=404, detail="Audio file not found")
|
|
|
|
return FileResponse(
|
|
job.audio_path,
|
|
media_type="audio/wav",
|
|
filename=f"{job_id}.wav"
|
|
)
|
|
|
|
|
|
@app.post("/tts/stream")
|
|
async def stream_tts(request: TTSStreamRequest):
|
|
"""
|
|
Stream TTS audio in real-time with true token-level streaming.
|
|
|
|
Audio starts playing within ~1-2s instead of waiting for full generation.
|
|
Uses vLLM AsyncLLMEngine to yield tokens incrementally, decoded to audio
|
|
via orpheus_tts.decoder.tokens_decoder every 7 tokens.
|
|
"""
|
|
global model
|
|
|
|
if model is None:
|
|
raise HTTPException(status_code=503, detail="Model not loaded")
|
|
|
|
voice = request.voice
|
|
if voice not in BUILTIN_VOICES:
|
|
voice = DEFAULT_VOICE
|
|
|
|
async def streaming_audio_generator():
|
|
"""Async generator: vLLM tokens → SNAC decoder → PCM audio chunks."""
|
|
from orpheus_tts.decoder import tokens_decoder
|
|
try:
|
|
text_chunks = chunk_text(request.text)
|
|
for chunk_idx, chunk in enumerate(text_chunks):
|
|
print(f"Stream chunk {chunk_idx + 1}/{len(text_chunks)}: {chunk[:80]}...")
|
|
|
|
# async_generate_tokens yields tokens as vLLM produces them
|
|
# tokens_decoder converts them to audio every 7 tokens
|
|
token_gen = async_generate_tokens(model, chunk, voice=voice)
|
|
async for audio_chunk in tokens_decoder(token_gen):
|
|
yield audio_chunk
|
|
except Exception as e:
|
|
print(f"Stream error: {e}")
|
|
raise
|
|
|
|
return StreamingResponse(
|
|
streaming_audio_generator(),
|
|
media_type="audio/pcm"
|
|
)
|
|
|
|
|
|
@app.post("/voice/clone")
|
|
async def upload_voice_reference(
|
|
name: str,
|
|
audio: UploadFile = File(...),
|
|
):
|
|
"""
|
|
Upload a reference audio file for voice cloning.
|
|
|
|
Args:
|
|
name: Name for this custom voice
|
|
audio: WAV audio file (5-30 seconds recommended)
|
|
"""
|
|
if not name.isalnum():
|
|
raise HTTPException(status_code=400, detail="Voice name must be alphanumeric")
|
|
|
|
if name in BUILTIN_VOICES:
|
|
raise HTTPException(status_code=400, detail="Cannot overwrite built-in voice")
|
|
|
|
# Save the reference audio
|
|
voice_path = VOICES_DIR / f"{name}.wav"
|
|
|
|
try:
|
|
content = await audio.read()
|
|
with open(voice_path, 'wb') as f:
|
|
f.write(content)
|
|
|
|
return {
|
|
"status": "success",
|
|
"voice_name": name,
|
|
"message": f"Voice '{name}' saved. Use voice='{name}' in TTS requests."
|
|
}
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to save voice: {e}")
|
|
|
|
|
|
@app.delete("/voice/{name}")
|
|
async def delete_voice(name: str):
|
|
"""Delete a custom voice."""
|
|
if name in BUILTIN_VOICES:
|
|
raise HTTPException(status_code=400, detail="Cannot delete built-in voice")
|
|
|
|
voice_path = VOICES_DIR / f"{name}.wav"
|
|
if not voice_path.exists():
|
|
raise HTTPException(status_code=404, detail="Voice not found")
|
|
|
|
voice_path.unlink()
|
|
return {"status": "success", "message": f"Voice '{name}' deleted"}
|
|
|
|
|
|
@app.delete("/tts/job/{job_id}")
|
|
async def delete_job(job_id: str):
|
|
"""Delete a job and its audio file."""
|
|
if job_id not in jobs:
|
|
raise HTTPException(status_code=404, detail="Job not found")
|
|
|
|
job = jobs[job_id]
|
|
|
|
if job.audio_path and Path(job.audio_path).exists():
|
|
try:
|
|
Path(job.audio_path).unlink()
|
|
except:
|
|
pass
|
|
|
|
del jobs[job_id]
|
|
save_jobs_to_disk()
|
|
|
|
return {"message": f"Job {job_id} deleted"}
|
|
|
|
|
|
if __name__ == "__main__":
    # Local import: uvicorn is only needed when running as a script.
    import uvicorn
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8766,  # Same port as VoiceTail for drop-in replacement
        reload=False
    )
|