diff --git a/main.py b/main.py index dccc8fe..d363ead 100644 --- a/main.py +++ b/main.py @@ -164,9 +164,9 @@ def get_custom_voices() -> List[str]: return voices -def generate_speech(text: str, voice: str) -> bytes: +async def generate_speech(text: str, voice: str) -> bytes: """ - Generate speech using Orpheus model. + Generate speech using Orpheus model (async version). Args: text: Text to convert (may include emotion tags) @@ -189,8 +189,10 @@ def generate_speech(text: str, voice: str) -> bytes: print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'") voice = DEFAULT_VOICE - # Generate speech using Orpheus - # Note: text is passed as-is, emotion tags like are handled by Orpheus + print(f"{text}") + + # Generate speech using Orpheus - async iteration! + import numpy as np audio_chunks = [] syn_tokens = model.generate_speech( @@ -198,12 +200,11 @@ def generate_speech(text: str, voice: str) -> bytes: voice=voice, ) - # Collect audio chunks - for audio_chunk in syn_tokens: + # Async iteration over the generator + async for audio_chunk in syn_tokens: audio_chunks.append(audio_chunk) # Combine chunks into single audio - import numpy as np audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0] # Convert to WAV bytes @@ -225,8 +226,8 @@ def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str: return str(output_path) -def generate_speech_background(job_id: str, text: str, voice: str): - """Background task for speech generation.""" +async def generate_speech_background(job_id: str, text: str, voice: str): + """Background task for speech generation (async).""" try: jobs[job_id].status = JobStatus.PROCESSING jobs[job_id].progress = 25 @@ -251,7 +252,7 @@ def generate_speech_background(job_id: str, text: str, voice: str): save_jobs_to_disk() print(f"Generating audio for job {job_id}...") - audio_bytes = generate_speech(text, voice) + audio_bytes = await generate_speech(text, voice) # Save to file jobs[job_id].progress = 75 @@ -437,7 +438,7 @@ def list_voices(): @app.post("/tts/submit", response_model=JobResponse) -async def submit_tts_job(request: TTSRequest, background_tasks: BackgroundTasks): +async def submit_tts_job(request: TTSRequest): """Submit a TTS job for processing.""" job_id = str(uuid.uuid4()) @@ -453,11 +454,9 @@ async def submit_tts_job(request: TTSRequest, background_tasks: BackgroundTasks) jobs[job_id] = job save_jobs_to_disk() - background_tasks.add_task( - generate_speech_background, - job_id, - request.text, - request.voice + # Use asyncio.create_task for proper async execution + asyncio.create_task( + generate_speech_background(job_id, request.text, request.voice) ) print(f"Job {job_id} submitted: '{request.text[:50]}...' with voice '{request.voice}'") @@ -530,14 +529,14 @@ async def stream_tts(request: TTSStreamRequest): voice = DEFAULT_VOICE async def audio_generator(): - """Generate audio chunks""" + """Generate audio chunks (async)""" try: syn_tokens = model.generate_speech( prompt=request.text, voice=voice, ) - for audio_chunk in syn_tokens: + async for audio_chunk in syn_tokens: yield audio_chunk except Exception as e: