From 0b8818890716221d002c7b4d4db5d43a7278688a Mon Sep 17 00:00:00 2001 From: vixy Date: Sun, 11 Jan 2026 18:44:07 -0600 Subject: [PATCH] Debug: add verbose logging to generate_speech_sync --- main.py | 77 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/main.py b/main.py index 6a55b35..abb3c28 100644 --- a/main.py +++ b/main.py @@ -164,9 +164,9 @@ def get_custom_voices() -> List[str]: return voices -async def generate_speech(text: str, voice: str) -> bytes: +def generate_speech_sync(text: str, voice: str) -> bytes: """ - Generate speech using Orpheus model (async wrapper). + Generate speech using Orpheus model (synchronous). Args: text: Text to convert (may include emotion tags) @@ -176,51 +176,56 @@ async def generate_speech(text: str, voice: str) -> bytes: WAV audio bytes """ global model + import numpy as np # Check if it's a custom voice (needs reference audio) custom_voice_path = VOICES_DIR / f"{voice}.wav" if custom_voice_path.exists(): - # TODO: Implement voice cloning with reference audio - # For now, fall back to built-in voice print(f"Custom voice '{voice}' - voice cloning to be implemented") voice = DEFAULT_VOICE elif voice not in BUILTIN_VOICES: print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'") voice = DEFAULT_VOICE - print(f"{text}") + print(f"Generating: {text}") - # Run synchronous generation in thread pool to not block event loop - def _generate_sync(): - import numpy as np - audio_chunks = [] - - syn_tokens = model.generate_speech( - prompt=text, - voice=voice, - ) - - # Sync iteration - generator yields audio chunks - for audio_chunk in syn_tokens: - audio_chunks.append(audio_chunk) - - # Combine chunks into single audio - audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0] - - # Convert to WAV bytes - buffer = io.BytesIO() - with wave.open(buffer, 'wb') as wf: - wf.setnchannels(1) - wf.setsampwidth(2) # 16-bit - wf.setframerate(SAMPLE_RATE) - wf.writeframes(audio_data) - - return buffer.getvalue() + audio_chunks = [] - # Run in executor to avoid blocking - loop = asyncio.get_event_loop() - return await loop.run_in_executor(None, _generate_sync) + # Call model directly - it returns a generator + syn_tokens = model.generate_speech( + prompt=text, + voice=voice, + ) + + print(f"Got generator: {type(syn_tokens)}") + + # Iterate over generator + for i, audio_chunk in enumerate(syn_tokens): + print(f"Chunk {i}: {type(audio_chunk)}, shape: {audio_chunk.shape if hasattr(audio_chunk, 'shape') else 'N/A'}") + audio_chunks.append(audio_chunk) + + print(f"Total chunks: {len(audio_chunks)}") + + # Combine chunks into single audio + audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0] + + # Convert to WAV bytes + buffer = io.BytesIO() + with wave.open(buffer, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) # 16-bit + wf.setframerate(SAMPLE_RATE) + # Ensure audio is int16 + if audio_data.dtype != np.int16: + if audio_data.dtype in [np.float32, np.float64]: + audio_data = (audio_data * 32767).astype(np.int16) + else: + audio_data = audio_data.astype(np.int16) + wf.writeframes(audio_data.tobytes()) + + print(f"Generated WAV: {len(buffer.getvalue())} bytes") + return buffer.getvalue() def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str: @@ -252,12 +257,12 @@ async def generate_speech_background(job_id: str, text: str, voice: str): print(f"Job {job_id} completed from cache") return - # Generate audio + # Generate audio - call sync function directly (blocks but let's test if it works) jobs[job_id].progress = 50 save_jobs_to_disk() print(f"Generating audio for job {job_id}...") - audio_bytes = await generate_speech(text, voice) + audio_bytes = generate_speech_sync(text, voice) # Save to file jobs[job_id].progress = 75