Debug: add verbose logging to generate_speech_sync

2026-01-11 18:44:07 -06:00
parent 4eab3ccc01
commit 0b88188907
1 changed file with 41 additions and 36 deletions
--- a/main.py
+++ b/main.py
@@ -164,9 +164,9 @@ def get_custom_voices() -> List[str]:
    return voices
-async def generate_speech(text: str, voice: str) -> bytes:
+def generate_speech_sync(text: str, voice: str) -> bytes:
    """
-    Generate speech using Orpheus model (async wrapper).
+    Generate speech using Orpheus model (synchronous).
    Args:
        text: Text to convert (may include emotion tags)
@@ -176,35 +176,37 @@ async def generate_speech(text: str, voice: str) -> bytes:
        WAV audio bytes
    """
    global model
    import numpy as np
    # Check if it's a custom voice (needs reference audio)
    custom_voice_path = VOICES_DIR / f"{voice}.wav"
    if custom_voice_path.exists():
        # TODO: Implement voice cloning with reference audio
        # For now, fall back to built-in voice
        print(f"Custom voice '{voice}' - voice cloning to be implemented")
        voice = DEFAULT_VOICE
    elif voice not in BUILTIN_VOICES:
        print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
        voice = DEFAULT_VOICE
-    print(f"{text}")
+    print(f"Generating: {text}")
    # Run synchronous generation in thread pool to not block event loop
    def _generate_sync():
        import numpy as np
    audio_chunks = []
    # Call model directly - it returns a generator
    syn_tokens = model.generate_speech(
        prompt=text,
        voice=voice,
    )
-        # Sync iteration - generator yields audio chunks
+    print(f"Got generator: {type(syn_tokens)}")
-        for audio_chunk in syn_tokens:
+    
    # Iterate over generator
    for i, audio_chunk in enumerate(syn_tokens):
        print(f"Chunk {i}: {type(audio_chunk)}, shape: {audio_chunk.shape if hasattr(audio_chunk, 'shape') else 'N/A'}")
        audio_chunks.append(audio_chunk)
    print(f"Total chunks: {len(audio_chunks)}")
    # Combine chunks into single audio
    audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
@@ -214,14 +216,17 @@ async def generate_speech(text: str, voice: str) -> bytes:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(SAMPLE_RATE)
-            wf.writeframes(audio_data)
+        # Ensure audio is int16
        if audio_data.dtype != np.int16:
            if audio_data.dtype in [np.float32, np.float64]:
                audio_data = (audio_data * 32767).astype(np.int16)
            else:
                audio_data = audio_data.astype(np.int16)
        wf.writeframes(audio_data.tobytes())
    print(f"Generated WAV: {len(buffer.getvalue())} bytes")
    return buffer.getvalue()
    # Run in executor to avoid blocking
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, _generate_sync)
 def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str:
    """Save audio bytes to WAV file."""
@@ -252,12 +257,12 @@ async def generate_speech_background(job_id: str, text: str, voice: str):
            print(f"Job {job_id} completed from cache")
            return
-        # Generate audio
+        # Generate audio - call sync function directly (blocks but let's test if it works)
        jobs[job_id].progress = 50
        save_jobs_to_disk()
        print(f"Generating audio for job {job_id}...")
-        audio_bytes = await generate_speech(text, voice)
+        audio_bytes = generate_speech_sync(text, voice)
        # Save to file
        jobs[job_id].progress = 75