diff --git a/main.py b/main.py index 8264009..7932fbe 100644 --- a/main.py +++ b/main.py @@ -357,6 +357,7 @@ async def startup(): # Also patch generate_tokens_sync to work with sync LLM def patched_generate_tokens_sync(self, prompt, voice=None, request_id="req-001", temperature=0.6, top_p=0.8, max_tokens=1200, stop_token_ids=[49158], repetition_penalty=1.3): from vllm import SamplingParams + import re prompt_string = self._format_prompt(prompt, voice) print(prompt) sampling_params = SamplingParams( @@ -371,8 +372,11 @@ async def startup(): # Yield individual tokens from the output text for output in outputs: text = output.outputs[0].text - # Tokens are space-separated custom_token_XXX - for token in text.split(): + print(f"Raw output (first 500 chars): {text[:500]}") + # Extract all patterns + tokens = re.findall(r'', text) + print(f"Found {len(tokens)} tokens") + for token in tokens: yield token OrpheusModel.generate_tokens_sync = patched_generate_tokens_sync