diff --git a/main.py b/main.py index d363ead..6a55b35 100644 --- a/main.py +++ b/main.py @@ -166,7 +166,7 @@ def get_custom_voices() -> List[str]: async def generate_speech(text: str, voice: str) -> bytes: """ - Generate speech using Orpheus model (async version). + Generate speech using Orpheus model (async wrapper). Args: text: Text to convert (may include emotion tags) @@ -191,31 +191,36 @@ async def generate_speech(text: str, voice: str) -> bytes: print(f"{text}") - # Generate speech using Orpheus - async iteration! - import numpy as np - audio_chunks = [] + # Run synchronous generation in thread pool to not block event loop + def _generate_sync(): + import numpy as np + audio_chunks = [] + + syn_tokens = model.generate_speech( + prompt=text, + voice=voice, + ) + + # Sync iteration - generator yields audio chunks + for audio_chunk in syn_tokens: + audio_chunks.append(audio_chunk) + + # Combine chunks into single audio + audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0] + + # Convert to WAV bytes + buffer = io.BytesIO() + with wave.open(buffer, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) # 16-bit + wf.setframerate(SAMPLE_RATE) + wf.writeframes(audio_data) + + return buffer.getvalue() - syn_tokens = model.generate_speech( - prompt=text, - voice=voice, - ) - - # Async iteration over the generator - async for audio_chunk in syn_tokens: - audio_chunks.append(audio_chunk) - - # Combine chunks into single audio - audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0] - - # Convert to WAV bytes - buffer = io.BytesIO() - with wave.open(buffer, 'wb') as wf: - wf.setnchannels(1) - wf.setsampwidth(2) # 16-bit - wf.setframerate(SAMPLE_RATE) - wf.writeframes(audio_data) - - return buffer.getvalue() + # Run in executor to avoid blocking + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, _generate_sync) def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str: @@ -528,15 +533,15 @@ async def stream_tts(request: TTSStreamRequest): if voice not in BUILTIN_VOICES: voice = DEFAULT_VOICE - async def audio_generator(): - """Generate audio chunks (async)""" + def sync_audio_generator(): + """Generate audio chunks (sync generator)""" try: syn_tokens = model.generate_speech( prompt=request.text, voice=voice, ) - async for audio_chunk in syn_tokens: + for audio_chunk in syn_tokens: yield audio_chunk except Exception as e: @@ -544,7 +549,7 @@ async def stream_tts(request: TTSStreamRequest): raise return StreamingResponse( - audio_generator(), + sync_audio_generator(), media_type="audio/wav" )