From 5ed2c6aee73e942331b1f31eead918d01a222167 Mon Sep 17 00:00:00 2001 From: vixy Date: Sat, 17 Jan 2026 11:17:17 -0600 Subject: [PATCH] Fix: Use arecord for shared audio stream - Replaced PyAudio with direct ALSA (arecord subprocess) - Single audio stream feeds both Porcupine and recording buffer - Fixes device unavailable error when recording after wake word - Simplified architecture --- headmic.py | 510 +++++++++++++++++++++-------------------------------- 1 file changed, 198 insertions(+), 312 deletions(-) diff --git a/headmic.py b/headmic.py index 4910b9f..fb281a5 100644 --- a/headmic.py +++ b/headmic.py @@ -7,33 +7,37 @@ Runs on head-vixy (Raspberry Pi 5). Wake word: "Hey Vivi" (trained via Picovoice Porcupine) +Architecture: Single shared audio stream feeds both Porcupine (wake word) +and recording buffer. This avoids device conflicts. + Flow: - 1. Listen for "Hey Vivi" wake word (Porcupine) - 2. ReSpeaker LEDs light up (listening state) - 3. Record until silence detected (webrtcvad) - 4. Send audio to EarTail (Whisper on BigOrin) - 5. Return transcription - 6. ReSpeaker LEDs off + 1. Continuous audio stream from ReSpeaker + 2. Feed frames to Porcupine for wake word detection + 3. On "Hey Vivi" → start buffering audio + 4. Use VAD to detect end of speech + 5. Send buffer to EarTail for transcription + 6. Return to listening mode Built by Vixy on Day 77 (January 17, 2026) 💜 """ import asyncio +import collections import io import logging import os import struct +import subprocess import threading import time import wave from pathlib import Path -from typing import Optional +from typing import Optional, List import httpx import pvporcupine -import pyaudio import webrtcvad -from fastapi import FastAPI, HTTPException, BackgroundTasks +from fastapi import FastAPI, HTTPException from pydantic import BaseModel # Configure logging @@ -44,74 +48,59 @@ logger = logging.getLogger("headmic") # Configuration # ============================================================================ -# Porcupine wake word PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "") WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn") -# Audio settings SAMPLE_RATE = 16000 -CHANNELS = 1 # Mono for transcription (pick channel 0 from 4-mic array) -FRAME_LENGTH = 512 # Porcupine frame length +ALSA_DEVICE = "plughw:3,0" # ReSpeaker 4 Mic Array - card 3, device 0 -# VAD settings -VAD_AGGRESSIVENESS = 3 # 0-3, higher = more aggressive filtering -SILENCE_THRESHOLD_MS = 1500 # Stop recording after this much silence -MAX_RECORDING_SEC = 30 # Maximum recording duration +VAD_AGGRESSIVENESS = 2 # 0-3, higher = more aggressive +SILENCE_FRAMES = 50 # ~1.5 sec of silence to stop (at 30ms frames) +MAX_RECORDING_FRAMES = 1000 # ~30 sec max -# EarTail EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764") -# ReSpeaker LED control -LED_ENABLED = True - # ============================================================================ -# LED Control (ReSpeaker 4-mic array has 12 APA102 LEDs) +# LED Control # ============================================================================ try: from pixel_ring import pixel_ring - PIXEL_RING_AVAILABLE = True + LEDS_AVAILABLE = True + pixel_ring.off() except ImportError: - PIXEL_RING_AVAILABLE = False - logger.warning("pixel_ring not available - LED feedback disabled") - - -def leds_listening(): - """Set LEDs to listening state (cyan spin).""" - if PIXEL_RING_AVAILABLE and LED_ENABLED: - try: - pixel_ring.set_color_palette(0x00FFFF, 0x000000) # Cyan - pixel_ring.think() - except Exception as e: - logger.warning(f"LED error: {e}") - - -def leds_processing(): - """Set LEDs to processing state (purple pulse).""" - if PIXEL_RING_AVAILABLE and LED_ENABLED: - try: - pixel_ring.set_color_palette(0x9400D3, 0x000000) # Purple - pixel_ring.spin() - except Exception as e: - logger.warning(f"LED error: {e}") - - -def leds_off(): - """Turn off LEDs.""" - if PIXEL_RING_AVAILABLE and LED_ENABLED: - try: - pixel_ring.off() - except Exception as e: - logger.warning(f"LED error: {e}") + LEDS_AVAILABLE = False + logger.warning("pixel_ring not available") def leds_wakeup(): - """Flash LEDs on wake word detection.""" - if PIXEL_RING_AVAILABLE and LED_ENABLED: + if LEDS_AVAILABLE: try: pixel_ring.wakeup() - except Exception as e: - logger.warning(f"LED error: {e}") + except: pass + + +def leds_listening(): + if LEDS_AVAILABLE: + try: + pixel_ring.set_color_palette(0x00FFFF, 0x000000) + pixel_ring.think() + except: pass + + +def leds_processing(): + if LEDS_AVAILABLE: + try: + pixel_ring.set_color_palette(0x9400D3, 0x000000) + pixel_ring.spin() + except: pass + + +def leds_off(): + if LEDS_AVAILABLE: + try: + pixel_ring.off() + except: pass # ============================================================================ @@ -120,268 +109,243 @@ def leds_wakeup(): class ServiceState: def __init__(self): + self.running = False self.listening = False self.recording = False self.processing = False - self.last_transcription = None - self.last_wake_time = None + self.last_transcription: Optional[str] = None + self.last_wake_time: Optional[float] = None self.wake_count = 0 - self.porcupine = None - self.audio = None - self.stream = None - self.listener_thread = None - self.running = False + self.error: Optional[str] = None state = ServiceState() + # ============================================================================ -# Audio Recording with VAD +# Audio Stream using ALSA directly (arecord) # ============================================================================ -def record_until_silence(timeout_sec: float = MAX_RECORDING_SEC) -> bytes: +def read_audio_stream(): """ - Record audio until silence is detected. - Returns WAV data as bytes. + Generator that yields audio frames from ALSA using arecord. + Each frame is 512 samples (32ms at 16kHz) as required by Porcupine. """ - vad = webrtcvad.Vad(VAD_AGGRESSIVENESS) + frame_size = 512 # Porcupine requires 512 samples + bytes_per_frame = frame_size * 2 # 16-bit = 2 bytes per sample - # VAD requires specific frame sizes: 10, 20, or 30 ms - frame_duration_ms = 30 - frame_size = int(SAMPLE_RATE * frame_duration_ms / 1000) + cmd = [ + "arecord", + "-D", ALSA_DEVICE, + "-f", "S16_LE", + "-r", str(SAMPLE_RATE), + "-c", "1", # Mono + "-t", "raw", + "-q", # Quiet + "-" + ] - p = pyaudio.PyAudio() + logger.info(f"Starting audio stream: {' '.join(cmd)}") - # Find the ReSpeaker device - device_index = None - for i in range(p.get_device_count()): - info = p.get_device_info_by_index(i) - if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower(): - device_index = i - break - - if device_index is None: - # Fallback to default - logger.warning("ReSpeaker not found, using default input") - - stream = p.open( - format=pyaudio.paInt16, - channels=4, # ReSpeaker has 4 channels - rate=SAMPLE_RATE, - input=True, - input_device_index=device_index, - frames_per_buffer=frame_size + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + bufsize=bytes_per_frame ) - logger.info("Recording started...") - frames = [] - silence_frames = 0 - silence_limit = int(SILENCE_THRESHOLD_MS / frame_duration_ms) - max_frames = int(timeout_sec * 1000 / frame_duration_ms) - try: - for _ in range(max_frames): - data = stream.read(frame_size, exception_on_overflow=False) - - # Extract channel 0 (mono) from 4-channel audio - # Each sample is 2 bytes (int16), 4 channels = 8 bytes per frame - mono_data = b'' - for i in range(0, len(data), 8): # 8 bytes per sample set - mono_data += data[i:i+2] # Take first channel only - - frames.append(mono_data) - - # Check for speech - is_speech = vad.is_speech(mono_data, SAMPLE_RATE) - - if is_speech: - silence_frames = 0 - else: - silence_frames += 1 - - # Stop if enough silence after we've recorded something - if len(frames) > 10 and silence_frames >= silence_limit: - logger.info(f"Silence detected after {len(frames)} frames") + while state.running: + data = proc.stdout.read(bytes_per_frame) + if len(data) < bytes_per_frame: break - + yield data finally: - stream.stop_stream() - stream.close() - p.terminate() - - # Convert to WAV - wav_buffer = io.BytesIO() - with wave.open(wav_buffer, 'wb') as wf: - wf.setnchannels(1) - wf.setsampwidth(2) # 16-bit - wf.setframerate(SAMPLE_RATE) - wf.writeframes(b''.join(frames)) - - wav_buffer.seek(0) - return wav_buffer.read() + proc.terminate() + proc.wait() # ============================================================================ -# EarTail Integration +# EarTail Transcription # ============================================================================ async def transcribe_audio(audio_data: bytes) -> str: """Send audio to EarTail and get transcription.""" async with httpx.AsyncClient(timeout=120.0) as client: - # Submit job files = {"audio": ("recording.wav", audio_data, "audio/wav")} response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files) response.raise_for_status() job_id = response.json().get("job_id") - logger.info(f"Transcription job submitted: {job_id}") + logger.info(f"Transcription job: {job_id}") - # Poll for completion - for _ in range(60): # Max 60 seconds - status_response = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}") - status_data = status_response.json() + for _ in range(120): + status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}") + data = status.json() - if status_data.get("status") == "SUCCESS": + if data.get("status") == "SUCCESS": result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}") return result.json().get("transcription", "") - elif status_data.get("status") == "FAILURE": - raise Exception(f"Transcription failed: {status_data.get('error')}") + elif data.get("status") == "FAILURE": + raise Exception(f"Transcription failed: {data.get('error')}") await asyncio.sleep(1) raise Exception("Transcription timeout") +def transcribe_sync(audio_data: bytes) -> str: + """Synchronous wrapper for transcription.""" + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(transcribe_audio(audio_data)) + finally: + loop.close() + + # ============================================================================ -# Wake Word Listener +# Main Listener Loop # ============================================================================ -def wake_word_listener(): - """Background thread that listens for wake word.""" +def audio_to_wav(frames: List[bytes]) -> bytes: + """Convert raw audio frames to WAV format.""" + wav_buffer = io.BytesIO() + with wave.open(wav_buffer, 'wb') as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(SAMPLE_RATE) + wf.writeframes(b''.join(frames)) + wav_buffer.seek(0) + return wav_buffer.read() + + +def listener_loop(): + """Main audio processing loop.""" global state - logger.info("Starting wake word listener...") - + logger.info("Initializing Porcupine...") try: - state.porcupine = pvporcupine.create( + porcupine = pvporcupine.create( access_key=PORCUPINE_ACCESS_KEY, keyword_paths=[WAKE_WORD_PATH] ) except Exception as e: - logger.error(f"Failed to initialize Porcupine: {e}") + logger.error(f"Failed to init Porcupine: {e}") + state.error = str(e) return - state.audio = pyaudio.PyAudio() + vad = webrtcvad.Vad(VAD_AGGRESSIVENESS) - # Find ReSpeaker device - device_index = None - for i in range(state.audio.get_device_count()): - info = state.audio.get_device_info_by_index(i) - if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower(): - device_index = i - break - - state.stream = state.audio.open( - rate=state.porcupine.sample_rate, - channels=1, - format=pyaudio.paInt16, - input=True, - input_device_index=device_index, - frames_per_buffer=state.porcupine.frame_length - ) + # VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples + # Porcupine needs 512 samples. We'll use 480 for VAD. + vad_frame_size = 480 + vad_frame_bytes = vad_frame_size * 2 state.listening = True - logger.info("Wake word listener active - say 'Hey Vivi'!") + logger.info("🦊 Wake word listener active - say 'Hey Vivi'!") - while state.running: - try: - pcm = state.stream.read(state.porcupine.frame_length, exception_on_overflow=False) - pcm = struct.unpack_from("h" * state.porcupine.frame_length, pcm) + recording_buffer: List[bytes] = [] + silence_count = 0 + is_recording = False + + try: + for frame_data in read_audio_stream(): + if not state.running: + break - keyword_index = state.porcupine.process(pcm) + # Convert bytes to int16 array for Porcupine + pcm = struct.unpack_from("h" * 512, frame_data) - if keyword_index >= 0: + # Check for wake word + keyword_index = porcupine.process(pcm) + + if keyword_index >= 0 and not is_recording: logger.info("🦊 Wake word detected: 'Hey Vivi'!") state.wake_count += 1 state.last_wake_time = time.time() - # Visual feedback leds_wakeup() - time.sleep(0.3) + time.sleep(0.2) leds_listening() - # Record and transcribe + is_recording = True state.recording = True + recording_buffer = [] + silence_count = 0 + logger.info("Recording started...") + continue + + if is_recording: + recording_buffer.append(frame_data) + + # Check VAD (use first 480 samples of the 512 frame) + vad_data = frame_data[:vad_frame_bytes] try: - audio_data = record_until_silence() + is_speech = vad.is_speech(vad_data, SAMPLE_RATE) + except: + is_speech = True # Assume speech on VAD error + + if is_speech: + silence_count = 0 + else: + silence_count += 1 + + # Stop conditions + should_stop = ( + (len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or + len(recording_buffer) >= MAX_RECORDING_FRAMES + ) + + if should_stop: + logger.info(f"Recording stopped: {len(recording_buffer)} frames") + is_recording = False + state.recording = False leds_processing() - state.recording = False state.processing = True - # Transcribe (run in asyncio) - loop = asyncio.new_event_loop() - transcription = loop.run_until_complete(transcribe_audio(audio_data)) - loop.close() + try: + wav_data = audio_to_wav(recording_buffer) + transcription = transcribe_sync(wav_data) + state.last_transcription = transcription + logger.info(f"Transcription: {transcription}") + except Exception as e: + logger.error(f"Transcription error: {e}") + state.error = str(e) + finally: + state.processing = False + leds_off() - state.last_transcription = transcription - logger.info(f"Transcription: {transcription}") - - except Exception as e: - logger.error(f"Recording/transcription error: {e}") - finally: - state.recording = False - state.processing = False - leds_off() - - except Exception as e: - logger.error(f"Listener error: {e}") - time.sleep(0.1) + recording_buffer = [] - # Cleanup - if state.stream: - state.stream.close() - if state.audio: - state.audio.terminate() - if state.porcupine: - state.porcupine.delete() - - state.listening = False - logger.info("Wake word listener stopped") + except Exception as e: + logger.error(f"Listener error: {e}") + state.error = str(e) + finally: + porcupine.delete() + state.listening = False + leds_off() + logger.info("Listener stopped") # ============================================================================ -# FastAPI App +# FastAPI # ============================================================================ -app = FastAPI(title="HeadMic", description="Vixy's Ears - Wake Word + Voice Recording 🦊👂") - - -class RecordRequest(BaseModel): - duration_sec: float = 5.0 - - -class TranscribeResponse(BaseModel): - transcription: str - duration_sec: float +app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂") @app.on_event("startup") async def startup(): - """Start the wake word listener on startup.""" state.running = True - state.listener_thread = threading.Thread(target=wake_word_listener, daemon=True) - state.listener_thread.start() - logger.info("HeadMic service started") + thread = threading.Thread(target=listener_loop, daemon=True) + thread.start() + logger.info("HeadMic started") @app.on_event("shutdown") async def shutdown(): - """Stop the wake word listener on shutdown.""" state.running = False leds_off() - if state.listener_thread: - state.listener_thread.join(timeout=5) - logger.info("HeadMic service stopped") @app.get("/") @@ -389,21 +353,19 @@ async def root(): return { "service": "HeadMic", "description": "Vixy's Ears 🦊👂", - "wake_word": "Hey Vivi", - "status": "listening" if state.listening else "idle" + "wake_word": "Hey Vivi" } @app.get("/health") async def health(): return { - "healthy": state.listening, + "healthy": state.listening and not state.error, "listening": state.listening, "recording": state.recording, "processing": state.processing, "wake_count": state.wake_count, - "porcupine_loaded": state.porcupine is not None, - "eartail_url": EARTAIL_URL + "error": state.error } @@ -415,89 +377,13 @@ async def status(): "processing": state.processing, "last_transcription": state.last_transcription, "last_wake_time": state.last_wake_time, - "wake_count": state.wake_count + "wake_count": state.wake_count, + "error": state.error } -@app.post("/record") -async def record(request: RecordRequest): - """Manually record for a specified duration.""" - if state.recording: - raise HTTPException(status_code=409, detail="Already recording") - - state.recording = True - leds_listening() - - try: - # Simple timed recording (not VAD-based) - p = pyaudio.PyAudio() - frames = [] - - stream = p.open( - format=pyaudio.paInt16, - channels=1, - rate=SAMPLE_RATE, - input=True, - frames_per_buffer=1024 - ) - - for _ in range(int(SAMPLE_RATE / 1024 * request.duration_sec)): - data = stream.read(1024) - frames.append(data) - - stream.stop_stream() - stream.close() - p.terminate() - - # Convert to WAV - wav_buffer = io.BytesIO() - with wave.open(wav_buffer, 'wb') as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(SAMPLE_RATE) - wf.writeframes(b''.join(frames)) - - wav_buffer.seek(0) - return {"success": True, "size_bytes": len(wav_buffer.getvalue())} - - finally: - state.recording = False - leds_off() - - -@app.post("/transcribe") -async def transcribe_endpoint(request: RecordRequest): - """Record and transcribe.""" - if state.recording or state.processing: - raise HTTPException(status_code=409, detail="Busy") - - state.recording = True - leds_listening() - - try: - start = time.time() - audio_data = record_until_silence(timeout_sec=request.duration_sec) - - leds_processing() - state.recording = False - state.processing = True - - transcription = await transcribe_audio(audio_data) - duration = time.time() - start - - state.last_transcription = transcription - - return TranscribeResponse(transcription=transcription, duration_sec=duration) - - finally: - state.recording = False - state.processing = False - leds_off() - - @app.get("/last") -async def last_transcription(): - """Get the last transcription.""" +async def last(): return { "transcription": state.last_transcription, "wake_time": state.last_wake_time