From 0b8818890716221d002c7b4d4db5d43a7278688a Mon Sep 17 00:00:00 2001
From: vixy <vixy@k4zka.online>
Date: Sun, 11 Jan 2026 18:44:07 -0600
Subject: [PATCH] Debug: make generate_speech synchronous and add verbose logging

---
 main.py | 77 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/main.py b/main.py
index 6a55b35..abb3c28 100644
--- a/main.py
+++ b/main.py
@@ -164,9 +164,9 @@ def get_custom_voices() -> List[str]:
     return voices
 
 
-async def generate_speech(text: str, voice: str) -> bytes:
+def generate_speech_sync(text: str, voice: str) -> bytes:
     """
-    Generate speech using Orpheus model (async wrapper).
+    Generate speech using Orpheus model (synchronous).
     
     Args:
         text: Text to convert (may include emotion tags)
@@ -176,51 +176,56 @@ async def generate_speech(text: str, voice: str) -> bytes:
         WAV audio bytes
     """
     global model
+    import numpy as np
     
     # Check if it's a custom voice (needs reference audio)
     custom_voice_path = VOICES_DIR / f"{voice}.wav"
     
     if custom_voice_path.exists():
-        # TODO: Implement voice cloning with reference audio
-        # For now, fall back to built-in voice
         print(f"Custom voice '{voice}' - voice cloning to be implemented")
         voice = DEFAULT_VOICE
     elif voice not in BUILTIN_VOICES:
         print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
         voice = DEFAULT_VOICE
     
-    print(f"{text}")
+    print(f"Generating: {text}")
     
-    # Run synchronous generation in thread pool to not block event loop
-    def _generate_sync():
-        import numpy as np
-        audio_chunks = []
-        
-        syn_tokens = model.generate_speech(
-            prompt=text,
-            voice=voice,
-        )
-        
-        # Sync iteration - generator yields audio chunks
-        for audio_chunk in syn_tokens:
-            audio_chunks.append(audio_chunk)
-        
-        # Combine chunks into single audio
-        audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
-        
-        # Convert to WAV bytes
-        buffer = io.BytesIO()
-        with wave.open(buffer, 'wb') as wf:
-            wf.setnchannels(1)
-            wf.setsampwidth(2)  # 16-bit
-            wf.setframerate(SAMPLE_RATE)
-            wf.writeframes(audio_data)
-        
-        return buffer.getvalue()
+    audio_chunks = []
     
-    # Run in executor to avoid blocking
-    loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(None, _generate_sync)
+    # Call model directly - it returns a generator
+    syn_tokens = model.generate_speech(
+        prompt=text,
+        voice=voice,
+    )
+    
+    print(f"Got generator: {type(syn_tokens)}")
+    
+    # Iterate over generator
+    for i, audio_chunk in enumerate(syn_tokens):
+        print(f"Chunk {i}: {type(audio_chunk)}, shape: {audio_chunk.shape if hasattr(audio_chunk, 'shape') else 'N/A'}")
+        audio_chunks.append(audio_chunk)
+    
+    print(f"Total chunks: {len(audio_chunks)}")
+    
+    # Combine chunks into single audio
+    audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
+    
+    # Convert to WAV bytes
+    buffer = io.BytesIO()
+    with wave.open(buffer, 'wb') as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)  # 16-bit
+        wf.setframerate(SAMPLE_RATE)
+        # Ensure audio is int16
+        if audio_data.dtype != np.int16:
+            if audio_data.dtype in [np.float32, np.float64]:
+                audio_data = (audio_data * 32767).astype(np.int16)
+            else:
+                audio_data = audio_data.astype(np.int16)
+        wf.writeframes(audio_data.tobytes())
+    
+    print(f"Generated WAV: {len(buffer.getvalue())} bytes")
+    return buffer.getvalue()
 
 
 def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str:
@@ -252,12 +257,12 @@ async def generate_speech_background(job_id: str, text: str, voice: str):
             print(f"Job {job_id} completed from cache")
             return
 
-        # Generate audio
+        # Generate audio - call the sync function directly (this blocks the event loop; temporary, to test whether generation works at all)
         jobs[job_id].progress = 50
         save_jobs_to_disk()
 
         print(f"Generating audio for job {job_id}...")
-        audio_bytes = await generate_speech(text, voice)
+        audio_bytes = generate_speech_sync(text, voice)
 
         # Save to file
         jobs[job_id].progress = 75