Fix audio assembly - chunks are already bytes from SNAC decoder

2026-01-11 19:47:19 -06:00
parent fe43eda6bd
commit 96cd33732d
1 changed file with 7 additions and 9 deletions
--- a/main.py
+++ b/main.py
@@ -207,8 +207,12 @@ def generate_speech_sync(text: str, voice: str) -> bytes:
    
    print(f"Total chunks: {len(audio_chunks)}")
    
-    # Combine chunks into single audio
-    audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
+    # Chunks are raw int16 bytes from SNAC decoder - just concatenate
+    if len(audio_chunks) == 0:
+        raise ValueError("No audio chunks generated")
+    
+    # Concatenate bytes directly
+    audio_bytes_raw = b''.join(audio_chunks)
    
    # Convert to WAV bytes
    buffer = io.BytesIO()
@@ -216,13 +220,7 @@ def generate_speech_sync(text: str, voice: str) -> bytes:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(SAMPLE_RATE)
-        # Ensure audio is int16
-        if audio_data.dtype != np.int16:
-            if audio_data.dtype in [np.float32, np.float64]:
-                audio_data = (audio_data * 32767).astype(np.int16)
-            else:
-                audio_data = audio_data.astype(np.int16)
-        wf.writeframes(audio_data.tobytes())
+        wf.writeframes(audio_bytes_raw)
    
    print(f"Generated WAV: {len(buffer.getvalue())} bytes")
    return buffer.getvalue()