Fix audio assembly - chunks are already bytes from SNAC decoder

This commit is contained in:
2026-01-11 19:47:19 -06:00
parent fe43eda6bd
commit 96cd33732d

16
main.py
View File

@@ -207,8 +207,12 @@ def generate_speech_sync(text: str, voice: str) -> bytes:
print(f"Total chunks: {len(audio_chunks)}") print(f"Total chunks: {len(audio_chunks)}")
# Combine chunks into single audio # Chunks are raw int16 bytes from SNAC decoder - just concatenate
audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0] if len(audio_chunks) == 0:
raise ValueError("No audio chunks generated")
# Concatenate bytes directly
audio_bytes_raw = b''.join(audio_chunks)
# Convert to WAV bytes # Convert to WAV bytes
buffer = io.BytesIO() buffer = io.BytesIO()
@@ -216,13 +220,7 @@ def generate_speech_sync(text: str, voice: str) -> bytes:
wf.setnchannels(1) wf.setnchannels(1)
wf.setsampwidth(2) # 16-bit wf.setsampwidth(2) # 16-bit
wf.setframerate(SAMPLE_RATE) wf.setframerate(SAMPLE_RATE)
# Ensure audio is int16 wf.writeframes(audio_bytes_raw)
if audio_data.dtype != np.int16:
if audio_data.dtype in [np.float32, np.float64]:
audio_data = (audio_data * 32767).astype(np.int16)
else:
audio_data = audio_data.astype(np.int16)
wf.writeframes(audio_data.tobytes())
print(f"Generated WAV: {len(buffer.getvalue())} bytes") print(f"Generated WAV: {len(buffer.getvalue())} bytes")
return buffer.getvalue() return buffer.getvalue()