Debug: convert generate_speech to synchronous generate_speech_sync; add verbose logging and int16 conversion

2026-01-11 18:44:07 -06:00
parent 4eab3ccc01
commit 0b88188907
1 changed files with 41 additions and 36 deletions
--- a/main.py
+++ b/main.py
@@ -164,9 +164,9 @@ def get_custom_voices() -> List[str]:
    return voices


-async def generate_speech(text: str, voice: str) -> bytes:
+def generate_speech_sync(text: str, voice: str) -> bytes:
    """
-    Generate speech using Orpheus model (async wrapper).
+    Generate speech using Orpheus model (synchronous).
    
    Args:
        text: Text to convert (may include emotion tags)
@@ -176,51 +176,56 @@ async def generate_speech(text: str, voice: str) -> bytes:
        WAV audio bytes
    """
    global model
+    import numpy as np
    
    # Check if it's a custom voice (needs reference audio)
    custom_voice_path = VOICES_DIR / f"{voice}.wav"
    
    if custom_voice_path.exists():
-        # TODO: Implement voice cloning with reference audio
-        # For now, fall back to built-in voice
        print(f"Custom voice '{voice}' - voice cloning to be implemented")
        voice = DEFAULT_VOICE
    elif voice not in BUILTIN_VOICES:
        print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
        voice = DEFAULT_VOICE
    
-    print(f"{text}")
+    print(f"Generating: {text}")
    
-    # Run synchronous generation in thread pool to not block event loop
-    def _generate_sync():
-        import numpy as np
-        audio_chunks = []
+    audio_chunks = []
    
-        syn_tokens = model.generate_speech(
-            prompt=text,
-            voice=voice,
-        )
+    # Call model directly - it returns a generator
+    syn_tokens = model.generate_speech(
+        prompt=text,
+        voice=voice,
+    )
    
-        # Sync iteration - generator yields audio chunks
-        for audio_chunk in syn_tokens:
-            audio_chunks.append(audio_chunk)
+    print(f"Got generator: {type(syn_tokens)}")
    
-        # Combine chunks into single audio
-        audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
+    # Iterate over generator
+    for i, audio_chunk in enumerate(syn_tokens):
+        print(f"Chunk {i}: {type(audio_chunk)}, shape: {audio_chunk.shape if hasattr(audio_chunk, 'shape') else 'N/A'}")
+        audio_chunks.append(audio_chunk)
    
-        # Convert to WAV bytes
-        buffer = io.BytesIO()
-        with wave.open(buffer, 'wb') as wf:
-            wf.setnchannels(1)
-            wf.setsampwidth(2)  # 16-bit
-            wf.setframerate(SAMPLE_RATE)
-            wf.writeframes(audio_data)
+    print(f"Total chunks: {len(audio_chunks)}")
    
-        return buffer.getvalue()
+    # Combine chunks into single audio
+    audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
    
-    # Run in executor to avoid blocking
-    loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(None, _generate_sync)
+    # Convert to WAV bytes
+    buffer = io.BytesIO()
+    with wave.open(buffer, 'wb') as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)  # 16-bit
+        wf.setframerate(SAMPLE_RATE)
+        # Ensure audio is int16
+        if audio_data.dtype != np.int16:
+            if audio_data.dtype in [np.float32, np.float64]:
+                audio_data = (audio_data * 32767).astype(np.int16)
+            else:
+                audio_data = audio_data.astype(np.int16)
+        wf.writeframes(audio_data.tobytes())
+    
+    print(f"Generated WAV: {len(buffer.getvalue())} bytes")
+    return buffer.getvalue()


 def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str:
@@ -252,12 +257,12 @@ async def generate_speech_background(job_id: str, text: str, voice: str):
            print(f"Job {job_id} completed from cache")
            return

-        # Generate audio
+        # Generate audio - call sync function directly (NOTE: this blocks the event loop; debugging only, to test whether generation works)
        jobs[job_id].progress = 50
        save_jobs_to_disk()

        print(f"Generating audio for job {job_id}...")
-        audio_bytes = await generate_speech(text, voice)
+        audio_bytes = generate_speech_sync(text, voice)

        # Save to file
        jobs[job_id].progress = 75