Debug: add verbose logging to generate_speech_sync
This commit is contained in:
39
main.py
39
main.py
@@ -164,9 +164,9 @@ def get_custom_voices() -> List[str]:
|
|||||||
return voices
|
return voices
|
||||||
|
|
||||||
|
|
||||||
async def generate_speech(text: str, voice: str) -> bytes:
|
def generate_speech_sync(text: str, voice: str) -> bytes:
|
||||||
"""
|
"""
|
||||||
Generate speech using Orpheus model (async wrapper).
|
Generate speech using Orpheus model (synchronous).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: Text to convert (may include emotion tags)
|
text: Text to convert (may include emotion tags)
|
||||||
@@ -176,35 +176,37 @@ async def generate_speech(text: str, voice: str) -> bytes:
|
|||||||
WAV audio bytes
|
WAV audio bytes
|
||||||
"""
|
"""
|
||||||
global model
|
global model
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
# Check if it's a custom voice (needs reference audio)
|
# Check if it's a custom voice (needs reference audio)
|
||||||
custom_voice_path = VOICES_DIR / f"{voice}.wav"
|
custom_voice_path = VOICES_DIR / f"{voice}.wav"
|
||||||
|
|
||||||
if custom_voice_path.exists():
|
if custom_voice_path.exists():
|
||||||
# TODO: Implement voice cloning with reference audio
|
|
||||||
# For now, fall back to built-in voice
|
|
||||||
print(f"Custom voice '{voice}' - voice cloning to be implemented")
|
print(f"Custom voice '{voice}' - voice cloning to be implemented")
|
||||||
voice = DEFAULT_VOICE
|
voice = DEFAULT_VOICE
|
||||||
elif voice not in BUILTIN_VOICES:
|
elif voice not in BUILTIN_VOICES:
|
||||||
print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
|
print(f"Unknown voice '{voice}', using default '{DEFAULT_VOICE}'")
|
||||||
voice = DEFAULT_VOICE
|
voice = DEFAULT_VOICE
|
||||||
|
|
||||||
print(f"{text}")
|
print(f"Generating: {text}")
|
||||||
|
|
||||||
# Run synchronous generation in thread pool to not block event loop
|
|
||||||
def _generate_sync():
|
|
||||||
import numpy as np
|
|
||||||
audio_chunks = []
|
audio_chunks = []
|
||||||
|
|
||||||
|
# Call model directly - it returns a generator
|
||||||
syn_tokens = model.generate_speech(
|
syn_tokens = model.generate_speech(
|
||||||
prompt=text,
|
prompt=text,
|
||||||
voice=voice,
|
voice=voice,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Sync iteration - generator yields audio chunks
|
print(f"Got generator: {type(syn_tokens)}")
|
||||||
for audio_chunk in syn_tokens:
|
|
||||||
|
# Iterate over generator
|
||||||
|
for i, audio_chunk in enumerate(syn_tokens):
|
||||||
|
print(f"Chunk {i}: {type(audio_chunk)}, shape: {audio_chunk.shape if hasattr(audio_chunk, 'shape') else 'N/A'}")
|
||||||
audio_chunks.append(audio_chunk)
|
audio_chunks.append(audio_chunk)
|
||||||
|
|
||||||
|
print(f"Total chunks: {len(audio_chunks)}")
|
||||||
|
|
||||||
# Combine chunks into single audio
|
# Combine chunks into single audio
|
||||||
audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
|
audio_data = np.concatenate(audio_chunks) if len(audio_chunks) > 1 else audio_chunks[0]
|
||||||
|
|
||||||
@@ -214,14 +216,17 @@ async def generate_speech(text: str, voice: str) -> bytes:
|
|||||||
wf.setnchannels(1)
|
wf.setnchannels(1)
|
||||||
wf.setsampwidth(2) # 16-bit
|
wf.setsampwidth(2) # 16-bit
|
||||||
wf.setframerate(SAMPLE_RATE)
|
wf.setframerate(SAMPLE_RATE)
|
||||||
wf.writeframes(audio_data)
|
# Ensure audio is int16
|
||||||
|
if audio_data.dtype != np.int16:
|
||||||
|
if audio_data.dtype in [np.float32, np.float64]:
|
||||||
|
audio_data = (audio_data * 32767).astype(np.int16)
|
||||||
|
else:
|
||||||
|
audio_data = audio_data.astype(np.int16)
|
||||||
|
wf.writeframes(audio_data.tobytes())
|
||||||
|
|
||||||
|
print(f"Generated WAV: {len(buffer.getvalue())} bytes")
|
||||||
return buffer.getvalue()
|
return buffer.getvalue()
|
||||||
|
|
||||||
# Run in executor to avoid blocking
|
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
return await loop.run_in_executor(None, _generate_sync)
|
|
||||||
|
|
||||||
|
|
||||||
def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str:
|
def save_audio_to_file(job_id: str, audio_bytes: bytes) -> str:
|
||||||
"""Save audio bytes to WAV file."""
|
"""Save audio bytes to WAV file."""
|
||||||
@@ -252,12 +257,12 @@ async def generate_speech_background(job_id: str, text: str, voice: str):
|
|||||||
print(f"Job {job_id} completed from cache")
|
print(f"Job {job_id} completed from cache")
|
||||||
return
|
return
|
||||||
|
|
||||||
# Generate audio
|
# Generate audio - call sync function directly (blocks but let's test if it works)
|
||||||
jobs[job_id].progress = 50
|
jobs[job_id].progress = 50
|
||||||
save_jobs_to_disk()
|
save_jobs_to_disk()
|
||||||
|
|
||||||
print(f"Generating audio for job {job_id}...")
|
print(f"Generating audio for job {job_id}...")
|
||||||
audio_bytes = await generate_speech(text, voice)
|
audio_bytes = generate_speech_sync(text, voice)
|
||||||
|
|
||||||
# Save to file
|
# Save to file
|
||||||
jobs[job_id].progress = 75
|
jobs[job_id].progress = 75
|
||||||
|
|||||||
Reference in New Issue
Block a user