From 96cd33732d74c39baf25a82d02f58b57365361ee Mon Sep 17 00:00:00 2001 From: vixy Date: Sun, 11 Jan 2026 19:47:19 -0600 Subject: [PATCH] Fix audio assembly - chunks are already bytes from SNAC decoder --- main.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 7932fbe..8a8dfdb 100644 --- a/main.py +++ b/main.py @@ -207,8 +207,12 @@ def generate_speech_sync(text: str, voice: str) -> bytes: print(f"Total chunks: {len(audio_chunks)}") - # Combine chunks into single audio - audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0] + # Chunks are raw int16 bytes from SNAC decoder - just concatenate + if len(audio_chunks) == 0: + raise ValueError("No audio chunks generated") + + # Concatenate bytes directly + audio_bytes_raw = b''.join(audio_chunks) # Convert to WAV bytes buffer = io.BytesIO() @@ -216,13 +220,7 @@ def generate_speech_sync(text: str, voice: str) -> bytes: wf.setnchannels(1) wf.setsampwidth(2) # 16-bit wf.setframerate(SAMPLE_RATE) - # Ensure audio is int16 - if audio_data.dtype != np.int16: - if audio_data.dtype in [np.float32, np.float64]: - audio_data = (audio_data * 32767).astype(np.int16) - else: - audio_data = audio_data.astype(np.int16) - wf.writeframes(audio_data.tobytes()) + wf.writeframes(audio_bytes_raw) print(f"Generated WAV: {len(buffer.getvalue())} bytes") return buffer.getvalue()