Fix: Use arecord for shared audio stream

- Replaced PyAudio with direct ALSA (arecord subprocess)
- Single audio stream feeds both Porcupine and recording buffer
- Fixes device unavailable error when recording after wake word
- Simplified architecture
This commit is contained in:
2026-01-17 11:17:17 -06:00
parent be7e26b6e7
commit 5ed2c6aee7

View File

@@ -7,33 +7,37 @@ Runs on head-vixy (Raspberry Pi 5).
Wake word: "Hey Vivi" (trained via Picovoice Porcupine) Wake word: "Hey Vivi" (trained via Picovoice Porcupine)
Architecture: Single shared audio stream feeds both Porcupine (wake word)
and recording buffer. This avoids device conflicts.
Flow: Flow:
1. Listen for "Hey Vivi" wake word (Porcupine) 1. Continuous audio stream from ReSpeaker
2. ReSpeaker LEDs light up (listening state) 2. Feed frames to Porcupine for wake word detection
3. Record until silence detected (webrtcvad) 3. On "Hey Vivi" → start buffering audio
4. Send audio to EarTail (Whisper on BigOrin) 4. Use VAD to detect end of speech
5. Return transcription 5. Send buffer to EarTail for transcription
6. ReSpeaker LEDs off 6. Return to listening mode
Built by Vixy on Day 77 (January 17, 2026) 💜 Built by Vixy on Day 77 (January 17, 2026) 💜
""" """
import asyncio import asyncio
import collections
import io import io
import logging import logging
import os import os
import struct import struct
import subprocess
import threading import threading
import time import time
import wave import wave
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional, List
import httpx import httpx
import pvporcupine import pvporcupine
import pyaudio
import webrtcvad import webrtcvad
from fastapi import FastAPI, HTTPException, BackgroundTasks from fastapi import FastAPI, HTTPException
from pydantic import BaseModel from pydantic import BaseModel
# Configure logging # Configure logging
@@ -44,74 +48,59 @@ logger = logging.getLogger("headmic")
# Configuration # Configuration
# ============================================================================ # ============================================================================
# Porcupine wake word
PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "") PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "")
WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn") WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn")
# Audio settings
SAMPLE_RATE = 16000 SAMPLE_RATE = 16000
CHANNELS = 1 # Mono for transcription (pick channel 0 from 4-mic array) ALSA_DEVICE = "plughw:3,0" # ReSpeaker 4 Mic Array - card 3, device 0
FRAME_LENGTH = 512 # Porcupine frame length
# VAD settings VAD_AGGRESSIVENESS = 2 # 0-3, higher = more aggressive
VAD_AGGRESSIVENESS = 3 # 0-3, higher = more aggressive filtering SILENCE_FRAMES = 50 # ~1.5 sec of silence to stop (at 30ms frames)
SILENCE_THRESHOLD_MS = 1500 # Stop recording after this much silence MAX_RECORDING_FRAMES = 1000 # ~30 sec max
MAX_RECORDING_SEC = 30 # Maximum recording duration
# EarTail
EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764") EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764")
# ReSpeaker LED control
LED_ENABLED = True
# ============================================================================ # ============================================================================
# LED Control (ReSpeaker 4-mic array has 12 APA102 LEDs) # LED Control
# ============================================================================ # ============================================================================
try: try:
from pixel_ring import pixel_ring from pixel_ring import pixel_ring
PIXEL_RING_AVAILABLE = True LEDS_AVAILABLE = True
except ImportError:
PIXEL_RING_AVAILABLE = False
logger.warning("pixel_ring not available - LED feedback disabled")
def leds_listening():
"""Set LEDs to listening state (cyan palette, pixel_ring "think" pattern)."""
# No-op unless pixel_ring was imported and LED feedback is enabled.
if PIXEL_RING_AVAILABLE and LED_ENABLED:
try:
pixel_ring.set_color_palette(0x00FFFF, 0x000000) # Cyan
pixel_ring.think()
except Exception as e:
# LED feedback is best-effort: log the failure and keep going.
logger.warning(f"LED error: {e}")
def leds_processing():
"""Set LEDs to processing state (purple palette, pixel_ring "spin" pattern)."""
# No-op unless pixel_ring was imported and LED feedback is enabled.
if PIXEL_RING_AVAILABLE and LED_ENABLED:
try:
pixel_ring.set_color_palette(0x9400D3, 0x000000) # Purple
pixel_ring.spin()
except Exception as e:
# LED feedback is best-effort: log the failure and keep going.
logger.warning(f"LED error: {e}")
def leds_off():
"""Turn off LEDs."""
if PIXEL_RING_AVAILABLE and LED_ENABLED:
try:
pixel_ring.off() pixel_ring.off()
except Exception as e: except ImportError:
logger.warning(f"LED error: {e}") LEDS_AVAILABLE = False
logger.warning("pixel_ring not available")
def leds_wakeup(): def leds_wakeup():
"""Flash LEDs on wake word detection.""" if LEDS_AVAILABLE:
if PIXEL_RING_AVAILABLE and LED_ENABLED:
try: try:
pixel_ring.wakeup() pixel_ring.wakeup()
except Exception as e: except: pass
logger.warning(f"LED error: {e}")
def leds_listening():
    """Show the listening state on the LED ring (cyan palette, "think" pattern).

    Does nothing when pixel_ring could not be imported. LED feedback is
    best-effort: a failure is logged and never propagates to the caller.
    """
    if not LEDS_AVAILABLE:
        return
    try:
        pixel_ring.set_color_palette(0x00FFFF, 0x000000)  # cyan
        pixel_ring.think()
    except Exception as e:
        # A bare `except: pass` would also swallow KeyboardInterrupt and
        # SystemExit and hide hardware faults; log the error instead.
        logger.warning(f"LED error: {e}")
def leds_processing():
    """Show the processing state on the LED ring (purple palette, "spin" pattern).

    Does nothing when pixel_ring could not be imported. LED feedback is
    best-effort: a failure is logged and never propagates to the caller.
    """
    if not LEDS_AVAILABLE:
        return
    try:
        pixel_ring.set_color_palette(0x9400D3, 0x000000)  # purple
        pixel_ring.spin()
    except Exception as e:
        # A bare `except: pass` would also swallow KeyboardInterrupt and
        # SystemExit and hide hardware faults; log the error instead.
        logger.warning(f"LED error: {e}")
def leds_off():
    """Turn the LED ring off.

    Does nothing when pixel_ring could not be imported. LED feedback is
    best-effort: a failure is logged and never propagates to the caller.
    """
    if not LEDS_AVAILABLE:
        return
    try:
        pixel_ring.off()
    except Exception as e:
        # A bare `except: pass` would also swallow KeyboardInterrupt and
        # SystemExit and hide hardware faults; log the error instead.
        logger.warning(f"LED error: {e}")
# ============================================================================ # ============================================================================
@@ -120,268 +109,243 @@ def leds_wakeup():
class ServiceState: class ServiceState:
def __init__(self): def __init__(self):
self.running = False
self.listening = False self.listening = False
self.recording = False self.recording = False
self.processing = False self.processing = False
self.last_transcription = None self.last_transcription: Optional[str] = None
self.last_wake_time = None self.last_wake_time: Optional[float] = None
self.wake_count = 0 self.wake_count = 0
self.porcupine = None self.error: Optional[str] = None
self.audio = None
self.stream = None
self.listener_thread = None
self.running = False
state = ServiceState() state = ServiceState()
# ============================================================================ # ============================================================================
# Audio Recording with VAD # Audio Stream using ALSA directly (arecord)
# ============================================================================ # ============================================================================
def record_until_silence(timeout_sec: float = MAX_RECORDING_SEC) -> bytes: def read_audio_stream():
""" """
Record audio until silence is detected. Generator that yields audio frames from ALSA using arecord.
Returns WAV data as bytes. Each frame is 512 samples (32ms at 16kHz) as required by Porcupine.
""" """
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS) frame_size = 512 # Porcupine requires 512 samples
bytes_per_frame = frame_size * 2 # 16-bit = 2 bytes per sample
# VAD requires specific frame sizes: 10, 20, or 30 ms cmd = [
frame_duration_ms = 30 "arecord",
frame_size = int(SAMPLE_RATE * frame_duration_ms / 1000) "-D", ALSA_DEVICE,
"-f", "S16_LE",
"-r", str(SAMPLE_RATE),
"-c", "1", # Mono
"-t", "raw",
"-q", # Quiet
"-"
]
p = pyaudio.PyAudio() logger.info(f"Starting audio stream: {' '.join(cmd)}")
# Find the ReSpeaker device proc = subprocess.Popen(
device_index = None cmd,
for i in range(p.get_device_count()): stdout=subprocess.PIPE,
info = p.get_device_info_by_index(i) stderr=subprocess.DEVNULL,
if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower(): bufsize=bytes_per_frame
device_index = i
break
if device_index is None:
# Fallback to default
logger.warning("ReSpeaker not found, using default input")
stream = p.open(
format=pyaudio.paInt16,
channels=4, # ReSpeaker has 4 channels
rate=SAMPLE_RATE,
input=True,
input_device_index=device_index,
frames_per_buffer=frame_size
) )
logger.info("Recording started...")
frames = []
silence_frames = 0
silence_limit = int(SILENCE_THRESHOLD_MS / frame_duration_ms)
max_frames = int(timeout_sec * 1000 / frame_duration_ms)
try: try:
for _ in range(max_frames): while state.running:
data = stream.read(frame_size, exception_on_overflow=False) data = proc.stdout.read(bytes_per_frame)
if len(data) < bytes_per_frame:
# Extract channel 0 (mono) from 4-channel audio
# Each sample is 2 bytes (int16), 4 channels = 8 bytes per frame
mono_data = b''
for i in range(0, len(data), 8): # 8 bytes per sample set
mono_data += data[i:i+2] # Take first channel only
frames.append(mono_data)
# Check for speech
is_speech = vad.is_speech(mono_data, SAMPLE_RATE)
if is_speech:
silence_frames = 0
else:
silence_frames += 1
# Stop if enough silence after we've recorded something
if len(frames) > 10 and silence_frames >= silence_limit:
logger.info(f"Silence detected after {len(frames)} frames")
break break
yield data
finally: finally:
stream.stop_stream() proc.terminate()
stream.close() proc.wait()
p.terminate()
# Convert to WAV
wav_buffer = io.BytesIO()
with wave.open(wav_buffer, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2) # 16-bit
wf.setframerate(SAMPLE_RATE)
wf.writeframes(b''.join(frames))
wav_buffer.seek(0)
return wav_buffer.read()
# ============================================================================ # ============================================================================
# EarTail Integration # EarTail Transcription
# ============================================================================ # ============================================================================
async def transcribe_audio(audio_data: bytes) -> str: async def transcribe_audio(audio_data: bytes) -> str:
"""Send audio to EarTail and get transcription.""" """Send audio to EarTail and get transcription."""
async with httpx.AsyncClient(timeout=120.0) as client: async with httpx.AsyncClient(timeout=120.0) as client:
# Submit job
files = {"audio": ("recording.wav", audio_data, "audio/wav")} files = {"audio": ("recording.wav", audio_data, "audio/wav")}
response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files) response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files)
response.raise_for_status() response.raise_for_status()
job_id = response.json().get("job_id") job_id = response.json().get("job_id")
logger.info(f"Transcription job submitted: {job_id}") logger.info(f"Transcription job: {job_id}")
# Poll for completion for _ in range(120):
for _ in range(60): # Max 60 seconds status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
status_response = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}") data = status.json()
status_data = status_response.json()
if status_data.get("status") == "SUCCESS": if data.get("status") == "SUCCESS":
result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}") result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}")
return result.json().get("transcription", "") return result.json().get("transcription", "")
elif status_data.get("status") == "FAILURE": elif data.get("status") == "FAILURE":
raise Exception(f"Transcription failed: {status_data.get('error')}") raise Exception(f"Transcription failed: {data.get('error')}")
await asyncio.sleep(1) await asyncio.sleep(1)
raise Exception("Transcription timeout") raise Exception("Transcription timeout")
def transcribe_sync(audio_data: bytes) -> str:
    """Run transcribe_audio() to completion from synchronous code.

    Intended for callers that have no running event loop in their thread
    (the listener loop calls it this way).

    Args:
        audio_data: WAV-encoded audio to submit for transcription.

    Returns:
        The transcription text returned by transcribe_audio().

    Raises:
        RuntimeError: If an event loop is already running in this thread.
        Exception: Whatever transcribe_audio() raises (HTTP errors, a
            reported failure, or a timeout).
    """
    # asyncio.run() creates a fresh loop, cancels leftover tasks, shuts down
    # async generators and closes the loop, which the manual
    # new_event_loop()/close() pair did not do.
    return asyncio.run(transcribe_audio(audio_data))
# ============================================================================ # ============================================================================
# Wake Word Listener # Main Listener Loop
# ============================================================================ # ============================================================================
def wake_word_listener(): def audio_to_wav(frames: List[bytes]) -> bytes:
"""Background thread that listens for wake word.""" """Convert raw audio frames to WAV format."""
wav_buffer = io.BytesIO()
with wave.open(wav_buffer, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(SAMPLE_RATE)
wf.writeframes(b''.join(frames))
wav_buffer.seek(0)
return wav_buffer.read()
def listener_loop():
"""Main audio processing loop."""
global state global state
logger.info("Starting wake word listener...") logger.info("Initializing Porcupine...")
try: try:
state.porcupine = pvporcupine.create( porcupine = pvporcupine.create(
access_key=PORCUPINE_ACCESS_KEY, access_key=PORCUPINE_ACCESS_KEY,
keyword_paths=[WAKE_WORD_PATH] keyword_paths=[WAKE_WORD_PATH]
) )
except Exception as e: except Exception as e:
logger.error(f"Failed to initialize Porcupine: {e}") logger.error(f"Failed to init Porcupine: {e}")
state.error = str(e)
return return
state.audio = pyaudio.PyAudio() vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
# Find ReSpeaker device # VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples
device_index = None # Porcupine needs 512 samples. We'll use 480 for VAD.
for i in range(state.audio.get_device_count()): vad_frame_size = 480
info = state.audio.get_device_info_by_index(i) vad_frame_bytes = vad_frame_size * 2
if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower():
device_index = i
break
state.stream = state.audio.open(
rate=state.porcupine.sample_rate,
channels=1,
format=pyaudio.paInt16,
input=True,
input_device_index=device_index,
frames_per_buffer=state.porcupine.frame_length
)
state.listening = True state.listening = True
logger.info("Wake word listener active - say 'Hey Vivi'!") logger.info("🦊 Wake word listener active - say 'Hey Vivi'!")
recording_buffer: List[bytes] = []
silence_count = 0
is_recording = False
while state.running:
try: try:
pcm = state.stream.read(state.porcupine.frame_length, exception_on_overflow=False) for frame_data in read_audio_stream():
pcm = struct.unpack_from("h" * state.porcupine.frame_length, pcm) if not state.running:
break
keyword_index = state.porcupine.process(pcm) # Convert bytes to int16 array for Porcupine
pcm = struct.unpack_from("h" * 512, frame_data)
if keyword_index >= 0: # Check for wake word
keyword_index = porcupine.process(pcm)
if keyword_index >= 0 and not is_recording:
logger.info("🦊 Wake word detected: 'Hey Vivi'!") logger.info("🦊 Wake word detected: 'Hey Vivi'!")
state.wake_count += 1 state.wake_count += 1
state.last_wake_time = time.time() state.last_wake_time = time.time()
# Visual feedback
leds_wakeup() leds_wakeup()
time.sleep(0.3) time.sleep(0.2)
leds_listening() leds_listening()
# Record and transcribe is_recording = True
state.recording = True state.recording = True
recording_buffer = []
silence_count = 0
logger.info("Recording started...")
continue
if is_recording:
recording_buffer.append(frame_data)
# Check VAD (use first 480 samples of the 512 frame)
vad_data = frame_data[:vad_frame_bytes]
try: try:
audio_data = record_until_silence() is_speech = vad.is_speech(vad_data, SAMPLE_RATE)
except:
is_speech = True # Assume speech on VAD error
if is_speech:
silence_count = 0
else:
silence_count += 1
# Stop conditions
should_stop = (
(len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or
len(recording_buffer) >= MAX_RECORDING_FRAMES
)
if should_stop:
logger.info(f"Recording stopped: {len(recording_buffer)} frames")
is_recording = False
state.recording = False
leds_processing() leds_processing()
state.recording = False
state.processing = True state.processing = True
# Transcribe (run in asyncio) try:
loop = asyncio.new_event_loop() wav_data = audio_to_wav(recording_buffer)
transcription = loop.run_until_complete(transcribe_audio(audio_data)) transcription = transcribe_sync(wav_data)
loop.close()
state.last_transcription = transcription state.last_transcription = transcription
logger.info(f"Transcription: {transcription}") logger.info(f"Transcription: {transcription}")
except Exception as e: except Exception as e:
logger.error(f"Recording/transcription error: {e}") logger.error(f"Transcription error: {e}")
state.error = str(e)
finally: finally:
state.recording = False
state.processing = False state.processing = False
leds_off() leds_off()
recording_buffer = []
except Exception as e: except Exception as e:
logger.error(f"Listener error: {e}") logger.error(f"Listener error: {e}")
time.sleep(0.1) state.error = str(e)
finally:
# Cleanup porcupine.delete()
if state.stream:
state.stream.close()
if state.audio:
state.audio.terminate()
if state.porcupine:
state.porcupine.delete()
state.listening = False state.listening = False
logger.info("Wake word listener stopped") leds_off()
logger.info("Listener stopped")
# ============================================================================ # ============================================================================
# FastAPI App # FastAPI
# ============================================================================ # ============================================================================
app = FastAPI(title="HeadMic", description="Vixy's Ears - Wake Word + Voice Recording 🦊👂") app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂")
# Request body accepted by the /record and /transcribe endpoints.
class RecordRequest(BaseModel):
# Duration in seconds; /record records for this long, /transcribe passes it
# to the recorder as its timeout. Defaults to 5.0.
duration_sec: float = 5.0
# Response body returned by the /transcribe endpoint.
class TranscribeResponse(BaseModel):
# Text of the transcription.
transcription: str
# Elapsed wall-clock seconds, as measured by the endpoint that builds it.
duration_sec: float
@app.on_event("startup") @app.on_event("startup")
async def startup(): async def startup():
"""Start the wake word listener on startup."""
state.running = True state.running = True
state.listener_thread = threading.Thread(target=wake_word_listener, daemon=True) thread = threading.Thread(target=listener_loop, daemon=True)
state.listener_thread.start() thread.start()
logger.info("HeadMic service started") logger.info("HeadMic started")
@app.on_event("shutdown") @app.on_event("shutdown")
async def shutdown(): async def shutdown():
"""Stop the wake word listener on shutdown."""
state.running = False state.running = False
leds_off() leds_off()
if state.listener_thread:
state.listener_thread.join(timeout=5)
logger.info("HeadMic service stopped")
@app.get("/") @app.get("/")
@@ -389,21 +353,19 @@ async def root():
return { return {
"service": "HeadMic", "service": "HeadMic",
"description": "Vixy's Ears 🦊👂", "description": "Vixy's Ears 🦊👂",
"wake_word": "Hey Vivi", "wake_word": "Hey Vivi"
"status": "listening" if state.listening else "idle"
} }
@app.get("/health") @app.get("/health")
async def health(): async def health():
return { return {
"healthy": state.listening, "healthy": state.listening and not state.error,
"listening": state.listening, "listening": state.listening,
"recording": state.recording, "recording": state.recording,
"processing": state.processing, "processing": state.processing,
"wake_count": state.wake_count, "wake_count": state.wake_count,
"porcupine_loaded": state.porcupine is not None, "error": state.error
"eartail_url": EARTAIL_URL
} }
@@ -415,89 +377,13 @@ async def status():
"processing": state.processing, "processing": state.processing,
"last_transcription": state.last_transcription, "last_transcription": state.last_transcription,
"last_wake_time": state.last_wake_time, "last_wake_time": state.last_wake_time,
"wake_count": state.wake_count "wake_count": state.wake_count,
"error": state.error
} }
@app.post("/record")
async def record(request: RecordRequest):
"""Manually record for a specified duration.

Reads mono 16-bit audio at SAMPLE_RATE in 1024-sample buffers for about
request.duration_sec seconds, wraps it in a WAV container, and returns
{"success": True, "size_bytes": <WAV size>}. Raises HTTP 409 when a
recording is already in progress.
"""
if state.recording:
raise HTTPException(status_code=409, detail="Already recording")
state.recording = True
leds_listening()
try:
# Simple timed recording (not VAD-based)
# No input_device_index is passed, so PyAudio picks the input device.
p = pyaudio.PyAudio()
frames = []
stream = p.open(
format=pyaudio.paInt16,
channels=1,
rate=SAMPLE_RATE,
input=True,
frames_per_buffer=1024
)
# Number of 1024-sample reads that covers the requested duration.
for _ in range(int(SAMPLE_RATE / 1024 * request.duration_sec)):
data = stream.read(1024)
frames.append(data)
# NOTE(review): this cleanup sits in the try body, not the finally, so the
# stream and PyAudio handle are not released if p.open() or stream.read() raises.
stream.stop_stream()
stream.close()
p.terminate()
# Convert to WAV
wav_buffer = io.BytesIO()
with wave.open(wav_buffer, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(SAMPLE_RATE)
wf.writeframes(b''.join(frames))
wav_buffer.seek(0)
# Only the size is reported; the recorded audio itself is discarded.
return {"success": True, "size_bytes": len(wav_buffer.getvalue())}
finally:
# Always clear the recording flag and LEDs, even on failure.
state.recording = False
leds_off()
@app.post("/transcribe")
async def transcribe_endpoint(request: RecordRequest):
"""Record and transcribe.

Records via record_until_silence() with request.duration_sec as its
timeout, sends the audio to transcribe_audio(), stores the text in
state.last_transcription, and returns a TranscribeResponse. The reported
duration_sec covers both recording and transcription. Raises HTTP 409 when
the service is already recording or processing.
"""
if state.recording or state.processing:
raise HTTPException(status_code=409, detail="Busy")
state.recording = True
leds_listening()
try:
start = time.time()
# NOTE(review): record_until_silence() is called synchronously from an
# async handler; presumably it blocks the event loop while recording — confirm.
audio_data = record_until_silence(timeout_sec=request.duration_sec)
leds_processing()
state.recording = False
state.processing = True
transcription = await transcribe_audio(audio_data)
duration = time.time() - start
state.last_transcription = transcription
return TranscribeResponse(transcription=transcription, duration_sec=duration)
finally:
# Always reset both flags and the LEDs, even on failure.
state.recording = False
state.processing = False
leds_off()
@app.get("/last") @app.get("/last")
async def last_transcription(): async def last():
"""Get the last transcription."""
return { return {
"transcription": state.last_transcription, "transcription": state.last_transcription,
"wake_time": state.last_wake_time "wake_time": state.last_wake_time