Fix: Use arecord for shared audio stream
- Replaced PyAudio with direct ALSA capture (arecord subprocess)
- Single audio stream feeds both Porcupine and the recording buffer
- Fixes the "device unavailable" error when recording after the wake word
- Simplified architecture
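
The core of the fix, sketched minimally below: one arecord process owns the capture device and yields fixed-size frames, and every consumer shares those frames instead of reopening the device. The device string and the stand-in "loudness" check here are illustrative assumptions, not the committed code (the real loop feeds frames to porcupine.process):

    import struct
    import subprocess

    DEVICE = "plughw:3,0"    # assumed ALSA device; check `arecord -l`
    FRAME_SAMPLES = 512      # Porcupine-sized frames: 512 int16 samples (32 ms @ 16 kHz)
    FRAME_BYTES = FRAME_SAMPLES * 2

    def shared_frames():
        """Single capture process; all consumers read the same frames."""
        proc = subprocess.Popen(
            ["arecord", "-D", DEVICE, "-f", "S16_LE", "-r", "16000",
             "-c", "1", "-t", "raw", "-q", "-"],
            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
        )
        try:
            while True:
                data = proc.stdout.read(FRAME_BYTES)
                if len(data) < FRAME_BYTES:
                    break
                yield data
        finally:
            proc.terminate()
            proc.wait()

    buffer = []
    for frame in shared_frames():
        pcm = struct.unpack("<512h", frame)   # consumer 1: detector input
        if max(abs(s) for s in pcm) > 500:    # stand-in for porcupine.process(pcm)
            buffer.append(frame)              # consumer 2: recording buffer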
headmic.py (510 lines changed)
@@ -7,33 +7,37 @@ Runs on head-vixy (Raspberry Pi 5).
 
 Wake word: "Hey Vivi" (trained via Picovoice Porcupine)
 
+Architecture: Single shared audio stream feeds both Porcupine (wake word)
+and recording buffer. This avoids device conflicts.
+
 Flow:
-1. Listen for "Hey Vivi" wake word (Porcupine)
-2. ReSpeaker LEDs light up (listening state)
-3. Record until silence detected (webrtcvad)
-4. Send audio to EarTail (Whisper on BigOrin)
-5. Return transcription
-6. ReSpeaker LEDs off
+1. Continuous audio stream from ReSpeaker
+2. Feed frames to Porcupine for wake word detection
+3. On "Hey Vivi" → start buffering audio
+4. Use VAD to detect end of speech
+5. Send buffer to EarTail for transcription
+6. Return to listening mode
 
 Built by Vixy on Day 77 (January 17, 2026) 💜
 """
 
 import asyncio
+import collections
 import io
 import logging
 import os
 import struct
+import subprocess
 import threading
 import time
 import wave
 from pathlib import Path
-from typing import Optional
+from typing import Optional, List
 
 import httpx
 import pvporcupine
-import pyaudio
 import webrtcvad
-from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 
 # Configure logging
@@ -44,74 +48,59 @@ logger = logging.getLogger("headmic")
 # Configuration
 # ============================================================================
 
-# Porcupine wake word
 PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "")
 WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn")
 
-# Audio settings
 SAMPLE_RATE = 16000
-CHANNELS = 1  # Mono for transcription (pick channel 0 from 4-mic array)
-FRAME_LENGTH = 512  # Porcupine frame length
+ALSA_DEVICE = "plughw:3,0"  # ReSpeaker 4 Mic Array - card 3, device 0
 
-# VAD settings
-VAD_AGGRESSIVENESS = 3  # 0-3, higher = more aggressive filtering
-SILENCE_THRESHOLD_MS = 1500  # Stop recording after this much silence
-MAX_RECORDING_SEC = 30  # Maximum recording duration
+VAD_AGGRESSIVENESS = 2  # 0-3, higher = more aggressive
+SILENCE_FRAMES = 50  # ~1.5 sec of silence to stop (at 30ms frames)
+MAX_RECORDING_FRAMES = 1000  # ~30 sec max
 
-# EarTail
 EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764")
 
-# ReSpeaker LED control
-LED_ENABLED = True
-
 # ============================================================================
-# LED Control (ReSpeaker 4-mic array has 12 APA102 LEDs)
+# LED Control
 # ============================================================================
 
 try:
     from pixel_ring import pixel_ring
-    PIXEL_RING_AVAILABLE = True
+    LEDS_AVAILABLE = True
+    pixel_ring.off()
 except ImportError:
-    PIXEL_RING_AVAILABLE = False
-    logger.warning("pixel_ring not available - LED feedback disabled")
+    LEDS_AVAILABLE = False
+    logger.warning("pixel_ring not available")
 
 
-def leds_listening():
-    """Set LEDs to listening state (cyan spin)."""
-    if PIXEL_RING_AVAILABLE and LED_ENABLED:
-        try:
-            pixel_ring.set_color_palette(0x00FFFF, 0x000000)  # Cyan
-            pixel_ring.think()
-        except Exception as e:
-            logger.warning(f"LED error: {e}")
-
-
-def leds_processing():
-    """Set LEDs to processing state (purple pulse)."""
-    if PIXEL_RING_AVAILABLE and LED_ENABLED:
-        try:
-            pixel_ring.set_color_palette(0x9400D3, 0x000000)  # Purple
-            pixel_ring.spin()
-        except Exception as e:
-            logger.warning(f"LED error: {e}")
-
-
-def leds_off():
-    """Turn off LEDs."""
-    if PIXEL_RING_AVAILABLE and LED_ENABLED:
-        try:
-            pixel_ring.off()
-        except Exception as e:
-            logger.warning(f"LED error: {e}")
-
-
 def leds_wakeup():
-    """Flash LEDs on wake word detection."""
-    if PIXEL_RING_AVAILABLE and LED_ENABLED:
+    if LEDS_AVAILABLE:
         try:
             pixel_ring.wakeup()
-        except Exception as e:
-            logger.warning(f"LED error: {e}")
+        except: pass
+
+
+def leds_listening():
+    if LEDS_AVAILABLE:
+        try:
+            pixel_ring.set_color_palette(0x00FFFF, 0x000000)
+            pixel_ring.think()
+        except: pass
+
+
+def leds_processing():
+    if LEDS_AVAILABLE:
+        try:
+            pixel_ring.set_color_palette(0x9400D3, 0x000000)
+            pixel_ring.spin()
+        except: pass
+
+
+def leds_off():
+    if LEDS_AVAILABLE:
+        try:
+            pixel_ring.off()
+        except: pass
 
 
 # ============================================================================
@@ -120,268 +109,243 @@ def leds_wakeup():
 
 class ServiceState:
     def __init__(self):
+        self.running = False
        self.listening = False
         self.recording = False
         self.processing = False
-        self.last_transcription = None
-        self.last_wake_time = None
+        self.last_transcription: Optional[str] = None
+        self.last_wake_time: Optional[float] = None
         self.wake_count = 0
-        self.porcupine = None
-        self.audio = None
-        self.stream = None
-        self.listener_thread = None
-        self.running = False
+        self.error: Optional[str] = None
 
 state = ServiceState()
 
 
 # ============================================================================
-# Audio Recording with VAD
+# Audio Stream using ALSA directly (arecord)
 # ============================================================================
 
-def record_until_silence(timeout_sec: float = MAX_RECORDING_SEC) -> bytes:
+def read_audio_stream():
     """
-    Record audio until silence is detected.
-    Returns WAV data as bytes.
+    Generator that yields audio frames from ALSA using arecord.
+    Each frame is 512 samples (32ms at 16kHz) as required by Porcupine.
     """
-    vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
+    frame_size = 512  # Porcupine requires 512 samples
+    bytes_per_frame = frame_size * 2  # 16-bit = 2 bytes per sample
 
-    # VAD requires specific frame sizes: 10, 20, or 30 ms
-    frame_duration_ms = 30
-    frame_size = int(SAMPLE_RATE * frame_duration_ms / 1000)
+    cmd = [
+        "arecord",
+        "-D", ALSA_DEVICE,
+        "-f", "S16_LE",
+        "-r", str(SAMPLE_RATE),
+        "-c", "1",  # Mono
+        "-t", "raw",
+        "-q",  # Quiet
+        "-"
+    ]
 
-    p = pyaudio.PyAudio()
+    logger.info(f"Starting audio stream: {' '.join(cmd)}")
 
-    # Find the ReSpeaker device
-    device_index = None
-    for i in range(p.get_device_count()):
-        info = p.get_device_info_by_index(i)
-        if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower():
-            device_index = i
-            break
-
-    if device_index is None:
-        # Fallback to default
-        logger.warning("ReSpeaker not found, using default input")
-
-    stream = p.open(
-        format=pyaudio.paInt16,
-        channels=4,  # ReSpeaker has 4 channels
-        rate=SAMPLE_RATE,
-        input=True,
-        input_device_index=device_index,
-        frames_per_buffer=frame_size
+    proc = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.DEVNULL,
+        bufsize=bytes_per_frame
     )
 
-    logger.info("Recording started...")
-    frames = []
-    silence_frames = 0
-    silence_limit = int(SILENCE_THRESHOLD_MS / frame_duration_ms)
-    max_frames = int(timeout_sec * 1000 / frame_duration_ms)
-
     try:
-        for _ in range(max_frames):
-            data = stream.read(frame_size, exception_on_overflow=False)
-
-            # Extract channel 0 (mono) from 4-channel audio
-            # Each sample is 2 bytes (int16), 4 channels = 8 bytes per frame
-            mono_data = b''
-            for i in range(0, len(data), 8):  # 8 bytes per sample set
-                mono_data += data[i:i+2]  # Take first channel only
-
-            frames.append(mono_data)
-
-            # Check for speech
-            is_speech = vad.is_speech(mono_data, SAMPLE_RATE)
-
-            if is_speech:
-                silence_frames = 0
-            else:
-                silence_frames += 1
-
-            # Stop if enough silence after we've recorded something
-            if len(frames) > 10 and silence_frames >= silence_limit:
-                logger.info(f"Silence detected after {len(frames)} frames")
+        while state.running:
+            data = proc.stdout.read(bytes_per_frame)
+            if len(data) < bytes_per_frame:
                 break
+            yield data
     finally:
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-    # Convert to WAV
-    wav_buffer = io.BytesIO()
-    with wave.open(wav_buffer, 'wb') as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)  # 16-bit
-        wf.setframerate(SAMPLE_RATE)
-        wf.writeframes(b''.join(frames))
-
-    wav_buffer.seek(0)
-    return wav_buffer.read()
+        proc.terminate()
+        proc.wait()
 
 
 # ============================================================================
-# EarTail Integration
+# EarTail Transcription
 # ============================================================================
 
 async def transcribe_audio(audio_data: bytes) -> str:
     """Send audio to EarTail and get transcription."""
     async with httpx.AsyncClient(timeout=120.0) as client:
-        # Submit job
         files = {"audio": ("recording.wav", audio_data, "audio/wav")}
         response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files)
         response.raise_for_status()
 
         job_id = response.json().get("job_id")
-        logger.info(f"Transcription job submitted: {job_id}")
+        logger.info(f"Transcription job: {job_id}")
 
-        # Poll for completion
-        for _ in range(60):  # Max 60 seconds
-            status_response = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
-            status_data = status_response.json()
+        for _ in range(120):
+            status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
+            data = status.json()
 
-            if status_data.get("status") == "SUCCESS":
+            if data.get("status") == "SUCCESS":
                 result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}")
                 return result.json().get("transcription", "")
-            elif status_data.get("status") == "FAILURE":
-                raise Exception(f"Transcription failed: {status_data.get('error')}")
+            elif data.get("status") == "FAILURE":
+                raise Exception(f"Transcription failed: {data.get('error')}")
 
             await asyncio.sleep(1)
 
         raise Exception("Transcription timeout")
 
 
+def transcribe_sync(audio_data: bytes) -> str:
+    """Synchronous wrapper for transcription."""
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(transcribe_audio(audio_data))
+    finally:
+        loop.close()
+
+
 # ============================================================================
-# Wake Word Listener
+# Main Listener Loop
 # ============================================================================
 
-def wake_word_listener():
-    """Background thread that listens for wake word."""
+def audio_to_wav(frames: List[bytes]) -> bytes:
+    """Convert raw audio frames to WAV format."""
+    wav_buffer = io.BytesIO()
+    with wave.open(wav_buffer, 'wb') as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(SAMPLE_RATE)
+        wf.writeframes(b''.join(frames))
+    wav_buffer.seek(0)
+    return wav_buffer.read()
+
+
+def listener_loop():
+    """Main audio processing loop."""
     global state
 
-    logger.info("Starting wake word listener...")
+    logger.info("Initializing Porcupine...")
 
     try:
-        state.porcupine = pvporcupine.create(
+        porcupine = pvporcupine.create(
             access_key=PORCUPINE_ACCESS_KEY,
             keyword_paths=[WAKE_WORD_PATH]
         )
     except Exception as e:
-        logger.error(f"Failed to initialize Porcupine: {e}")
+        logger.error(f"Failed to init Porcupine: {e}")
+        state.error = str(e)
         return
 
-    state.audio = pyaudio.PyAudio()
+    vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
 
-    # Find ReSpeaker device
-    device_index = None
-    for i in range(state.audio.get_device_count()):
-        info = state.audio.get_device_info_by_index(i)
-        if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower():
-            device_index = i
-            break
-
-    state.stream = state.audio.open(
-        rate=state.porcupine.sample_rate,
-        channels=1,
-        format=pyaudio.paInt16,
-        input=True,
-        input_device_index=device_index,
-        frames_per_buffer=state.porcupine.frame_length
-    )
+    # VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples
+    # Porcupine needs 512 samples. We'll use 480 for VAD.
+    vad_frame_size = 480
+    vad_frame_bytes = vad_frame_size * 2
 
     state.listening = True
-    logger.info("Wake word listener active - say 'Hey Vivi'!")
+    logger.info("🦊 Wake word listener active - say 'Hey Vivi'!")
 
-    while state.running:
-        try:
-            pcm = state.stream.read(state.porcupine.frame_length, exception_on_overflow=False)
-            pcm = struct.unpack_from("h" * state.porcupine.frame_length, pcm)
+    recording_buffer: List[bytes] = []
+    silence_count = 0
+    is_recording = False
 
-            keyword_index = state.porcupine.process(pcm)
+    try:
+        for frame_data in read_audio_stream():
+            if not state.running:
+                break
 
-            if keyword_index >= 0:
+            # Convert bytes to int16 array for Porcupine
+            pcm = struct.unpack_from("h" * 512, frame_data)
+
+            # Check for wake word
+            keyword_index = porcupine.process(pcm)
+
+            if keyword_index >= 0 and not is_recording:
                 logger.info("🦊 Wake word detected: 'Hey Vivi'!")
                 state.wake_count += 1
                 state.last_wake_time = time.time()
 
-                # Visual feedback
                 leds_wakeup()
-                time.sleep(0.3)
+                time.sleep(0.2)
                 leds_listening()
 
-                # Record and transcribe
+                is_recording = True
                 state.recording = True
+                recording_buffer = []
+                silence_count = 0
+                logger.info("Recording started...")
+                continue
+
+            if is_recording:
+                recording_buffer.append(frame_data)
+
+                # Check VAD (use first 480 samples of the 512 frame)
+                vad_data = frame_data[:vad_frame_bytes]
                 try:
-                    audio_data = record_until_silence()
+                    is_speech = vad.is_speech(vad_data, SAMPLE_RATE)
+                except:
+                    is_speech = True  # Assume speech on VAD error
+
+                if is_speech:
+                    silence_count = 0
+                else:
+                    silence_count += 1
+
+                # Stop conditions
+                should_stop = (
+                    (len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or
+                    len(recording_buffer) >= MAX_RECORDING_FRAMES
+                )
+
+                if should_stop:
+                    logger.info(f"Recording stopped: {len(recording_buffer)} frames")
+                    is_recording = False
+                    state.recording = False
 
                     leds_processing()
-                    state.recording = False
                     state.processing = True
 
-                    # Transcribe (run in asyncio)
-                    loop = asyncio.new_event_loop()
-                    transcription = loop.run_until_complete(transcribe_audio(audio_data))
-                    loop.close()
+                    try:
+                        wav_data = audio_to_wav(recording_buffer)
+                        transcription = transcribe_sync(wav_data)
+                        state.last_transcription = transcription
+                        logger.info(f"Transcription: {transcription}")
+                    except Exception as e:
+                        logger.error(f"Transcription error: {e}")
+                        state.error = str(e)
+                    finally:
+                        state.processing = False
+                        leds_off()
 
-                    state.last_transcription = transcription
-                    logger.info(f"Transcription: {transcription}")
-
-                except Exception as e:
-                    logger.error(f"Recording/transcription error: {e}")
-                finally:
-                    state.recording = False
-                    state.processing = False
-                    leds_off()
-
-        except Exception as e:
-            logger.error(f"Listener error: {e}")
-            time.sleep(0.1)
-
-    # Cleanup
-    if state.stream:
-        state.stream.close()
-    if state.audio:
-        state.audio.terminate()
-    if state.porcupine:
-        state.porcupine.delete()
-
-    state.listening = False
-    logger.info("Wake word listener stopped")
+                    recording_buffer = []
+
+    except Exception as e:
+        logger.error(f"Listener error: {e}")
+        state.error = str(e)
+    finally:
+        porcupine.delete()
+        state.listening = False
+        leds_off()
+        logger.info("Listener stopped")
 
 
 # ============================================================================
-# FastAPI App
+# FastAPI
 # ============================================================================
 
-app = FastAPI(title="HeadMic", description="Vixy's Ears - Wake Word + Voice Recording 🦊👂")
+app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂")
 
 
-class RecordRequest(BaseModel):
-    duration_sec: float = 5.0
-
-
-class TranscribeResponse(BaseModel):
-    transcription: str
-    duration_sec: float
-
-
 @app.on_event("startup")
 async def startup():
-    """Start the wake word listener on startup."""
     state.running = True
-    state.listener_thread = threading.Thread(target=wake_word_listener, daemon=True)
-    state.listener_thread.start()
-    logger.info("HeadMic service started")
+    thread = threading.Thread(target=listener_loop, daemon=True)
+    thread.start()
+    logger.info("HeadMic started")
 
 
 @app.on_event("shutdown")
 async def shutdown():
-    """Stop the wake word listener on shutdown."""
     state.running = False
     leds_off()
-    if state.listener_thread:
-        state.listener_thread.join(timeout=5)
-    logger.info("HeadMic service stopped")
 
 
 @app.get("/")
@@ -389,21 +353,19 @@ async def root():
     return {
         "service": "HeadMic",
         "description": "Vixy's Ears 🦊👂",
-        "wake_word": "Hey Vivi",
-        "status": "listening" if state.listening else "idle"
+        "wake_word": "Hey Vivi"
     }
 
 
 @app.get("/health")
 async def health():
     return {
-        "healthy": state.listening,
+        "healthy": state.listening and not state.error,
         "listening": state.listening,
         "recording": state.recording,
         "processing": state.processing,
         "wake_count": state.wake_count,
-        "porcupine_loaded": state.porcupine is not None,
-        "eartail_url": EARTAIL_URL
+        "error": state.error
     }
 
 
@@ -415,89 +377,13 @@ async def status():
         "processing": state.processing,
         "last_transcription": state.last_transcription,
         "last_wake_time": state.last_wake_time,
-        "wake_count": state.wake_count
+        "wake_count": state.wake_count,
+        "error": state.error
     }
 
 
-@app.post("/record")
-async def record(request: RecordRequest):
-    """Manually record for a specified duration."""
-    if state.recording:
-        raise HTTPException(status_code=409, detail="Already recording")
-
-    state.recording = True
-    leds_listening()
-
-    try:
-        # Simple timed recording (not VAD-based)
-        p = pyaudio.PyAudio()
-        frames = []
-
-        stream = p.open(
-            format=pyaudio.paInt16,
-            channels=1,
-            rate=SAMPLE_RATE,
-            input=True,
-            frames_per_buffer=1024
-        )
-
-        for _ in range(int(SAMPLE_RATE / 1024 * request.duration_sec)):
-            data = stream.read(1024)
-            frames.append(data)
-
-        stream.stop_stream()
-        stream.close()
-        p.terminate()
-
-        # Convert to WAV
-        wav_buffer = io.BytesIO()
-        with wave.open(wav_buffer, 'wb') as wf:
-            wf.setnchannels(1)
-            wf.setsampwidth(2)
-            wf.setframerate(SAMPLE_RATE)
-            wf.writeframes(b''.join(frames))
-
-        wav_buffer.seek(0)
-        return {"success": True, "size_bytes": len(wav_buffer.getvalue())}
-
-    finally:
-        state.recording = False
-        leds_off()
-
-
-@app.post("/transcribe")
-async def transcribe_endpoint(request: RecordRequest):
-    """Record and transcribe."""
-    if state.recording or state.processing:
-        raise HTTPException(status_code=409, detail="Busy")
-
-    state.recording = True
-    leds_listening()
-
-    try:
-        start = time.time()
-        audio_data = record_until_silence(timeout_sec=request.duration_sec)
-
-        leds_processing()
-        state.recording = False
-        state.processing = True
-
-        transcription = await transcribe_audio(audio_data)
-        duration = time.time() - start
-
-        state.last_transcription = transcription
-
-        return TranscribeResponse(transcription=transcription, duration_sec=duration)
-
-    finally:
-        state.recording = False
-        state.processing = False
-        leds_off()
-
-
 @app.get("/last")
-async def last_transcription():
-    """Get the last transcription."""
+async def last():
     return {
         "transcription": state.last_transcription,
         "wake_time": state.last_wake_time
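
Once the service is up, the reworked endpoints can be exercised with a short client like the sketch below (host and port are assumptions; the commit does not show how the app is served):

    import httpx

    BASE = "http://head-vixy.local:8000"  # assumed host/port, not shown in this commit

    print(httpx.get(f"{BASE}/health").json())  # healthy / listening / recording / processing / wake_count / error
    print(httpx.get(f"{BASE}/last").json())    # last transcription + wake time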