Debug: add verbose logging to generate_speech_sync

This commit is contained in:
2026-01-11 18:44:07 -06:00
parent 4eab3ccc01
commit 0b88188907

67
main.py
View File

@@ -164,9 +164,9 @@ def get_custom_voices() -> List[str]:
return voices
async def generate_speech(text: str, voice: str) -> bytes:
def generate_speech_sync(text: str, voice: str) -> bytes:
"""
Generate speech using Orpheus model (async wrapper).
Generate speech using Orpheus model (synchronous).
Args:
text: Text to convert (may include emotion tags)
@@ -176,51 +176,56 @@ async def generate_speech(text: str, voice: str) -> bytes:
WAV audio bytes
"""
global model
import numpy as np
# Check if it's a custom voice (needs reference audio)
custom_voice_path = VOICES_DIR / f"{voice}.wav"
if custom_voice_path.exists():
# TODO: Implement voice cloning with reference audio
# For now, fall back to built-in voice
print(f"Custom voice '{voice}' - voice cloning to be implemented")
voice = DEFAULT_VOICE
elif voice not in BUILTIN_VOICES:
print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
voice = DEFAULT_VOICE
print(f"{text}")
print(f"Generating: {text}")
# Run synchronous generation in thread pool to not block event loop
def _generate_sync():
import numpy as np
audio_chunks = []
audio_chunks = []
syn_tokens = model.generate_speech(
prompt=text,
voice=voice,
)
# Call model directly - it returns a generator
syn_tokens = model.generate_speech(
prompt=text,
voice=voice,
)
# Sync iteration - generator yields audio chunks
for audio_chunk in syn_tokens:
audio_chunks.append(audio_chunk)
print(f"Got generator: {type(syn_tokens)}")
# Combine chunks into single audio
audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
# Iterate over generator
for i, audio_chunk in enumerate(syn_tokens):
print(f"Chunk {i}: {type(audio_chunk)}, shape: {audio_chunk.shape if hasattr(audio_chunk, 'shape') else 'N/A'}")
audio_chunks.append(audio_chunk)
# Convert to WAV bytes
buffer = io.BytesIO()
with wave.open(buffer, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2) # 16-bit
wf.setframerate(SAMPLE_RATE)
wf.writeframes(audio_data)
print(f"Total chunks: {len(audio_chunks)}")
return buffer.getvalue()
# Combine chunks into single audio
audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
# Run in executor to avoid blocking
loop = asyncio.get_event_loop()
return await loop.run_in_executor(None, _generate_sync)
# Convert to WAV bytes
buffer = io.BytesIO()
with wave.open(buffer, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2) # 16-bit
wf.setframerate(SAMPLE_RATE)
# Ensure audio is int16
if audio_data.dtype != np.int16:
if audio_data.dtype in [np.float32, np.float64]:
audio_data = (audio_data * 32767).astype(np.int16)
else:
audio_data = audio_data.astype(np.int16)
wf.writeframes(audio_data.tobytes())
print(f"Generated WAV: {len(buffer.getvalue())} bytes")
return buffer.getvalue()
def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str:
@@ -252,12 +257,12 @@ async def generate_speech_background(job_id: str, text: str, voice: str):
print(f"Job {job_id} completed from cache")
return
# Generate audio
# Generate audio - call sync function directly (blocks but let's test if it works)
jobs[job_id].progress = 50
save_jobs_to_disk()
print(f"Generating audio for job {job_id}...")
audio_bytes = await generate_speech(text, voice)
audio_bytes = generate_speech_sync(text, voice)
# Save to file
jobs[job_id].progress = 75