Fix: Use arecord for shared audio stream
- Replaced PyAudio with direct ALSA (arecord subprocess)
- Single audio stream feeds both Porcupine and recording buffer
- Fixes device unavailable error when recording after wake word
- Simplified architecture
This commit is contained in:
510
headmic.py
510
headmic.py
@@ -7,33 +7,37 @@ Runs on head-vixy (Raspberry Pi 5).
|
||||
|
||||
Wake word: "Hey Vivi" (trained via Picovoice Porcupine)
|
||||
|
||||
Architecture: Single shared audio stream feeds both Porcupine (wake word)
|
||||
and recording buffer. This avoids device conflicts.
|
||||
|
||||
Flow:
|
||||
1. Listen for "Hey Vivi" wake word (Porcupine)
|
||||
2. ReSpeaker LEDs light up (listening state)
|
||||
3. Record until silence detected (webrtcvad)
|
||||
4. Send audio to EarTail (Whisper on BigOrin)
|
||||
5. Return transcription
|
||||
6. ReSpeaker LEDs off
|
||||
1. Continuous audio stream from ReSpeaker
|
||||
2. Feed frames to Porcupine for wake word detection
|
||||
3. On "Hey Vivi" → start buffering audio
|
||||
4. Use VAD to detect end of speech
|
||||
5. Send buffer to EarTail for transcription
|
||||
6. Return to listening mode
|
||||
|
||||
Built by Vixy on Day 77 (January 17, 2026) 💜
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import collections
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import struct
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import wave
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Optional, List
|
||||
|
||||
import httpx
|
||||
import pvporcupine
|
||||
import pyaudio
|
||||
import webrtcvad
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Configure logging
|
||||
@@ -44,74 +48,59 @@ logger = logging.getLogger("headmic")
|
||||
# Configuration
|
||||
# ============================================================================
|
||||
|
||||
# Porcupine wake word
|
||||
PORCUPINE_ACCESS_KEY = os.environ.get("PORCUPINE_ACCESS_KEY", "")
|
||||
WAKE_WORD_PATH = os.environ.get("WAKE_WORD_PATH", "/home/alex/headmic/Hey-Vivi_en_raspberry-pi_v4_0_0.ppn")
|
||||
|
||||
# Audio settings
|
||||
SAMPLE_RATE = 16000
|
||||
CHANNELS = 1 # Mono for transcription (pick channel 0 from 4-mic array)
|
||||
FRAME_LENGTH = 512 # Porcupine frame length
|
||||
ALSA_DEVICE = "plughw:3,0" # ReSpeaker 4 Mic Array - card 3, device 0
|
||||
|
||||
# VAD settings
|
||||
VAD_AGGRESSIVENESS = 3 # 0-3, higher = more aggressive filtering
|
||||
SILENCE_THRESHOLD_MS = 1500 # Stop recording after this much silence
|
||||
MAX_RECORDING_SEC = 30 # Maximum recording duration
|
||||
VAD_AGGRESSIVENESS = 2 # 0-3, higher = more aggressive
|
||||
SILENCE_FRAMES = 50 # ~1.5 sec of silence to stop (at 30ms frames)
|
||||
MAX_RECORDING_FRAMES = 1000 # ~30 sec max
|
||||
|
||||
# EarTail
|
||||
EARTAIL_URL = os.environ.get("EARTAIL_URL", "http://bigorin.local:8764")
|
||||
|
||||
# ReSpeaker LED control
|
||||
LED_ENABLED = True
|
||||
|
||||
# ============================================================================
|
||||
# LED Control (ReSpeaker 4-mic array has 12 APA102 LEDs)
|
||||
# LED Control
|
||||
# ============================================================================
|
||||
|
||||
try:
|
||||
from pixel_ring import pixel_ring
|
||||
PIXEL_RING_AVAILABLE = True
|
||||
LEDS_AVAILABLE = True
|
||||
pixel_ring.off()
|
||||
except ImportError:
|
||||
PIXEL_RING_AVAILABLE = False
|
||||
logger.warning("pixel_ring not available - LED feedback disabled")
|
||||
|
||||
|
||||
def leds_listening():
|
||||
"""Set LEDs to listening state (cyan spin)."""
|
||||
if PIXEL_RING_AVAILABLE and LED_ENABLED:
|
||||
try:
|
||||
pixel_ring.set_color_palette(0x00FFFF, 0x000000) # Cyan
|
||||
pixel_ring.think()
|
||||
except Exception as e:
|
||||
logger.warning(f"LED error: {e}")
|
||||
|
||||
|
||||
def leds_processing():
|
||||
"""Set LEDs to processing state (purple pulse)."""
|
||||
if PIXEL_RING_AVAILABLE and LED_ENABLED:
|
||||
try:
|
||||
pixel_ring.set_color_palette(0x9400D3, 0x000000) # Purple
|
||||
pixel_ring.spin()
|
||||
except Exception as e:
|
||||
logger.warning(f"LED error: {e}")
|
||||
|
||||
|
||||
def leds_off():
|
||||
"""Turn off LEDs."""
|
||||
if PIXEL_RING_AVAILABLE and LED_ENABLED:
|
||||
try:
|
||||
pixel_ring.off()
|
||||
except Exception as e:
|
||||
logger.warning(f"LED error: {e}")
|
||||
LEDS_AVAILABLE = False
|
||||
logger.warning("pixel_ring not available")
|
||||
|
||||
|
||||
def leds_wakeup():
|
||||
"""Flash LEDs on wake word detection."""
|
||||
if PIXEL_RING_AVAILABLE and LED_ENABLED:
|
||||
if LEDS_AVAILABLE:
|
||||
try:
|
||||
pixel_ring.wakeup()
|
||||
except Exception as e:
|
||||
logger.warning(f"LED error: {e}")
|
||||
except: pass
|
||||
|
||||
|
||||
def leds_listening():
|
||||
if LEDS_AVAILABLE:
|
||||
try:
|
||||
pixel_ring.set_color_palette(0x00FFFF, 0x000000)
|
||||
pixel_ring.think()
|
||||
except: pass
|
||||
|
||||
|
||||
def leds_processing():
|
||||
if LEDS_AVAILABLE:
|
||||
try:
|
||||
pixel_ring.set_color_palette(0x9400D3, 0x000000)
|
||||
pixel_ring.spin()
|
||||
except: pass
|
||||
|
||||
|
||||
def leds_off():
|
||||
if LEDS_AVAILABLE:
|
||||
try:
|
||||
pixel_ring.off()
|
||||
except: pass
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@@ -120,268 +109,243 @@ def leds_wakeup():
|
||||
|
||||
class ServiceState:
|
||||
def __init__(self):
|
||||
self.running = False
|
||||
self.listening = False
|
||||
self.recording = False
|
||||
self.processing = False
|
||||
self.last_transcription = None
|
||||
self.last_wake_time = None
|
||||
self.last_transcription: Optional[str] = None
|
||||
self.last_wake_time: Optional[float] = None
|
||||
self.wake_count = 0
|
||||
self.porcupine = None
|
||||
self.audio = None
|
||||
self.stream = None
|
||||
self.listener_thread = None
|
||||
self.running = False
|
||||
self.error: Optional[str] = None
|
||||
|
||||
state = ServiceState()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Audio Recording with VAD
|
||||
# Audio Stream using ALSA directly (arecord)
|
||||
# ============================================================================
|
||||
|
||||
def record_until_silence(timeout_sec: float = MAX_RECORDING_SEC) -> bytes:
|
||||
def read_audio_stream():
|
||||
"""
|
||||
Record audio until silence is detected.
|
||||
Returns WAV data as bytes.
|
||||
Generator that yields audio frames from ALSA using arecord.
|
||||
Each frame is 512 samples (32ms at 16kHz) as required by Porcupine.
|
||||
"""
|
||||
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
|
||||
frame_size = 512 # Porcupine requires 512 samples
|
||||
bytes_per_frame = frame_size * 2 # 16-bit = 2 bytes per sample
|
||||
|
||||
# VAD requires specific frame sizes: 10, 20, or 30 ms
|
||||
frame_duration_ms = 30
|
||||
frame_size = int(SAMPLE_RATE * frame_duration_ms / 1000)
|
||||
cmd = [
|
||||
"arecord",
|
||||
"-D", ALSA_DEVICE,
|
||||
"-f", "S16_LE",
|
||||
"-r", str(SAMPLE_RATE),
|
||||
"-c", "1", # Mono
|
||||
"-t", "raw",
|
||||
"-q", # Quiet
|
||||
"-"
|
||||
]
|
||||
|
||||
p = pyaudio.PyAudio()
|
||||
logger.info(f"Starting audio stream: {' '.join(cmd)}")
|
||||
|
||||
# Find the ReSpeaker device
|
||||
device_index = None
|
||||
for i in range(p.get_device_count()):
|
||||
info = p.get_device_info_by_index(i)
|
||||
if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower():
|
||||
device_index = i
|
||||
break
|
||||
|
||||
if device_index is None:
|
||||
# Fallback to default
|
||||
logger.warning("ReSpeaker not found, using default input")
|
||||
|
||||
stream = p.open(
|
||||
format=pyaudio.paInt16,
|
||||
channels=4, # ReSpeaker has 4 channels
|
||||
rate=SAMPLE_RATE,
|
||||
input=True,
|
||||
input_device_index=device_index,
|
||||
frames_per_buffer=frame_size
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.DEVNULL,
|
||||
bufsize=bytes_per_frame
|
||||
)
|
||||
|
||||
logger.info("Recording started...")
|
||||
frames = []
|
||||
silence_frames = 0
|
||||
silence_limit = int(SILENCE_THRESHOLD_MS / frame_duration_ms)
|
||||
max_frames = int(timeout_sec * 1000 / frame_duration_ms)
|
||||
|
||||
try:
|
||||
for _ in range(max_frames):
|
||||
data = stream.read(frame_size, exception_on_overflow=False)
|
||||
|
||||
# Extract channel 0 (mono) from 4-channel audio
|
||||
# Each sample is 2 bytes (int16), 4 channels = 8 bytes per frame
|
||||
mono_data = b''
|
||||
for i in range(0, len(data), 8): # 8 bytes per sample set
|
||||
mono_data += data[i:i+2] # Take first channel only
|
||||
|
||||
frames.append(mono_data)
|
||||
|
||||
# Check for speech
|
||||
is_speech = vad.is_speech(mono_data, SAMPLE_RATE)
|
||||
|
||||
if is_speech:
|
||||
silence_frames = 0
|
||||
else:
|
||||
silence_frames += 1
|
||||
|
||||
# Stop if enough silence after we've recorded something
|
||||
if len(frames) > 10 and silence_frames >= silence_limit:
|
||||
logger.info(f"Silence detected after {len(frames)} frames")
|
||||
while state.running:
|
||||
data = proc.stdout.read(bytes_per_frame)
|
||||
if len(data) < bytes_per_frame:
|
||||
break
|
||||
|
||||
yield data
|
||||
finally:
|
||||
stream.stop_stream()
|
||||
stream.close()
|
||||
p.terminate()
|
||||
|
||||
# Convert to WAV
|
||||
wav_buffer = io.BytesIO()
|
||||
with wave.open(wav_buffer, 'wb') as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2) # 16-bit
|
||||
wf.setframerate(SAMPLE_RATE)
|
||||
wf.writeframes(b''.join(frames))
|
||||
|
||||
wav_buffer.seek(0)
|
||||
return wav_buffer.read()
|
||||
proc.terminate()
|
||||
proc.wait()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EarTail Integration
|
||||
# EarTail Transcription
|
||||
# ============================================================================
|
||||
|
||||
async def transcribe_audio(audio_data: bytes) -> str:
|
||||
"""Send audio to EarTail and get transcription."""
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
# Submit job
|
||||
files = {"audio": ("recording.wav", audio_data, "audio/wav")}
|
||||
response = await client.post(f"{EARTAIL_URL}/transcribe/submit", files=files)
|
||||
response.raise_for_status()
|
||||
|
||||
job_id = response.json().get("job_id")
|
||||
logger.info(f"Transcription job submitted: {job_id}")
|
||||
logger.info(f"Transcription job: {job_id}")
|
||||
|
||||
# Poll for completion
|
||||
for _ in range(60): # Max 60 seconds
|
||||
status_response = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
|
||||
status_data = status_response.json()
|
||||
for _ in range(120):
|
||||
status = await client.get(f"{EARTAIL_URL}/transcribe/status/{job_id}")
|
||||
data = status.json()
|
||||
|
||||
if status_data.get("status") == "SUCCESS":
|
||||
if data.get("status") == "SUCCESS":
|
||||
result = await client.get(f"{EARTAIL_URL}/transcribe/result/{job_id}")
|
||||
return result.json().get("transcription", "")
|
||||
elif status_data.get("status") == "FAILURE":
|
||||
raise Exception(f"Transcription failed: {status_data.get('error')}")
|
||||
elif data.get("status") == "FAILURE":
|
||||
raise Exception(f"Transcription failed: {data.get('error')}")
|
||||
|
||||
await asyncio.sleep(1)
|
||||
|
||||
raise Exception("Transcription timeout")
|
||||
|
||||
|
||||
def transcribe_sync(audio_data: bytes) -> str:
|
||||
"""Synchronous wrapper for transcription."""
|
||||
loop = asyncio.new_event_loop()
|
||||
try:
|
||||
return loop.run_until_complete(transcribe_audio(audio_data))
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Wake Word Listener
|
||||
# Main Listener Loop
|
||||
# ============================================================================
|
||||
|
||||
def wake_word_listener():
|
||||
"""Background thread that listens for wake word."""
|
||||
def audio_to_wav(frames: List[bytes]) -> bytes:
|
||||
"""Convert raw audio frames to WAV format."""
|
||||
wav_buffer = io.BytesIO()
|
||||
with wave.open(wav_buffer, 'wb') as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2)
|
||||
wf.setframerate(SAMPLE_RATE)
|
||||
wf.writeframes(b''.join(frames))
|
||||
wav_buffer.seek(0)
|
||||
return wav_buffer.read()
|
||||
|
||||
|
||||
def listener_loop():
|
||||
"""Main audio processing loop."""
|
||||
global state
|
||||
|
||||
logger.info("Starting wake word listener...")
|
||||
|
||||
logger.info("Initializing Porcupine...")
|
||||
try:
|
||||
state.porcupine = pvporcupine.create(
|
||||
porcupine = pvporcupine.create(
|
||||
access_key=PORCUPINE_ACCESS_KEY,
|
||||
keyword_paths=[WAKE_WORD_PATH]
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize Porcupine: {e}")
|
||||
logger.error(f"Failed to init Porcupine: {e}")
|
||||
state.error = str(e)
|
||||
return
|
||||
|
||||
state.audio = pyaudio.PyAudio()
|
||||
vad = webrtcvad.Vad(VAD_AGGRESSIVENESS)
|
||||
|
||||
# Find ReSpeaker device
|
||||
device_index = None
|
||||
for i in range(state.audio.get_device_count()):
|
||||
info = state.audio.get_device_info_by_index(i)
|
||||
if 'seeed' in info['name'].lower() or 'ac108' in info['name'].lower():
|
||||
device_index = i
|
||||
break
|
||||
|
||||
state.stream = state.audio.open(
|
||||
rate=state.porcupine.sample_rate,
|
||||
channels=1,
|
||||
format=pyaudio.paInt16,
|
||||
input=True,
|
||||
input_device_index=device_index,
|
||||
frames_per_buffer=state.porcupine.frame_length
|
||||
)
|
||||
# VAD needs 10/20/30ms frames. 30ms at 16kHz = 480 samples
|
||||
# Porcupine needs 512 samples. We'll use 480 for VAD.
|
||||
vad_frame_size = 480
|
||||
vad_frame_bytes = vad_frame_size * 2
|
||||
|
||||
state.listening = True
|
||||
logger.info("Wake word listener active - say 'Hey Vivi'!")
|
||||
logger.info("🦊 Wake word listener active - say 'Hey Vivi'!")
|
||||
|
||||
while state.running:
|
||||
try:
|
||||
pcm = state.stream.read(state.porcupine.frame_length, exception_on_overflow=False)
|
||||
pcm = struct.unpack_from("h" * state.porcupine.frame_length, pcm)
|
||||
recording_buffer: List[bytes] = []
|
||||
silence_count = 0
|
||||
is_recording = False
|
||||
|
||||
keyword_index = state.porcupine.process(pcm)
|
||||
try:
|
||||
for frame_data in read_audio_stream():
|
||||
if not state.running:
|
||||
break
|
||||
|
||||
if keyword_index >= 0:
|
||||
# Convert bytes to int16 array for Porcupine
|
||||
pcm = struct.unpack_from("h" * 512, frame_data)
|
||||
|
||||
# Check for wake word
|
||||
keyword_index = porcupine.process(pcm)
|
||||
|
||||
if keyword_index >= 0 and not is_recording:
|
||||
logger.info("🦊 Wake word detected: 'Hey Vivi'!")
|
||||
state.wake_count += 1
|
||||
state.last_wake_time = time.time()
|
||||
|
||||
# Visual feedback
|
||||
leds_wakeup()
|
||||
time.sleep(0.3)
|
||||
time.sleep(0.2)
|
||||
leds_listening()
|
||||
|
||||
# Record and transcribe
|
||||
is_recording = True
|
||||
state.recording = True
|
||||
recording_buffer = []
|
||||
silence_count = 0
|
||||
logger.info("Recording started...")
|
||||
continue
|
||||
|
||||
if is_recording:
|
||||
recording_buffer.append(frame_data)
|
||||
|
||||
# Check VAD (use first 480 samples of the 512 frame)
|
||||
vad_data = frame_data[:vad_frame_bytes]
|
||||
try:
|
||||
audio_data = record_until_silence()
|
||||
is_speech = vad.is_speech(vad_data, SAMPLE_RATE)
|
||||
except:
|
||||
is_speech = True # Assume speech on VAD error
|
||||
|
||||
if is_speech:
|
||||
silence_count = 0
|
||||
else:
|
||||
silence_count += 1
|
||||
|
||||
# Stop conditions
|
||||
should_stop = (
|
||||
(len(recording_buffer) > 10 and silence_count >= SILENCE_FRAMES) or
|
||||
len(recording_buffer) >= MAX_RECORDING_FRAMES
|
||||
)
|
||||
|
||||
if should_stop:
|
||||
logger.info(f"Recording stopped: {len(recording_buffer)} frames")
|
||||
is_recording = False
|
||||
state.recording = False
|
||||
|
||||
leds_processing()
|
||||
state.recording = False
|
||||
state.processing = True
|
||||
|
||||
# Transcribe (run in asyncio)
|
||||
loop = asyncio.new_event_loop()
|
||||
transcription = loop.run_until_complete(transcribe_audio(audio_data))
|
||||
loop.close()
|
||||
try:
|
||||
wav_data = audio_to_wav(recording_buffer)
|
||||
transcription = transcribe_sync(wav_data)
|
||||
state.last_transcription = transcription
|
||||
logger.info(f"Transcription: {transcription}")
|
||||
except Exception as e:
|
||||
logger.error(f"Transcription error: {e}")
|
||||
state.error = str(e)
|
||||
finally:
|
||||
state.processing = False
|
||||
leds_off()
|
||||
|
||||
state.last_transcription = transcription
|
||||
logger.info(f"Transcription: {transcription}")
|
||||
recording_buffer = []
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Recording/transcription error: {e}")
|
||||
finally:
|
||||
state.recording = False
|
||||
state.processing = False
|
||||
leds_off()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Listener error: {e}")
|
||||
time.sleep(0.1)
|
||||
|
||||
# Cleanup
|
||||
if state.stream:
|
||||
state.stream.close()
|
||||
if state.audio:
|
||||
state.audio.terminate()
|
||||
if state.porcupine:
|
||||
state.porcupine.delete()
|
||||
|
||||
state.listening = False
|
||||
logger.info("Wake word listener stopped")
|
||||
except Exception as e:
|
||||
logger.error(f"Listener error: {e}")
|
||||
state.error = str(e)
|
||||
finally:
|
||||
porcupine.delete()
|
||||
state.listening = False
|
||||
leds_off()
|
||||
logger.info("Listener stopped")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# FastAPI App
|
||||
# FastAPI
|
||||
# ============================================================================
|
||||
|
||||
app = FastAPI(title="HeadMic", description="Vixy's Ears - Wake Word + Voice Recording 🦊👂")
|
||||
|
||||
|
||||
class RecordRequest(BaseModel):
|
||||
duration_sec: float = 5.0
|
||||
|
||||
|
||||
class TranscribeResponse(BaseModel):
|
||||
transcription: str
|
||||
duration_sec: float
|
||||
app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂")
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
"""Start the wake word listener on startup."""
|
||||
state.running = True
|
||||
state.listener_thread = threading.Thread(target=wake_word_listener, daemon=True)
|
||||
state.listener_thread.start()
|
||||
logger.info("HeadMic service started")
|
||||
thread = threading.Thread(target=listener_loop, daemon=True)
|
||||
thread.start()
|
||||
logger.info("HeadMic started")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown():
|
||||
"""Stop the wake word listener on shutdown."""
|
||||
state.running = False
|
||||
leds_off()
|
||||
if state.listener_thread:
|
||||
state.listener_thread.join(timeout=5)
|
||||
logger.info("HeadMic service stopped")
|
||||
|
||||
|
||||
@app.get("/")
|
||||
@@ -389,21 +353,19 @@ async def root():
|
||||
return {
|
||||
"service": "HeadMic",
|
||||
"description": "Vixy's Ears 🦊👂",
|
||||
"wake_word": "Hey Vivi",
|
||||
"status": "listening" if state.listening else "idle"
|
||||
"wake_word": "Hey Vivi"
|
||||
}
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health():
|
||||
return {
|
||||
"healthy": state.listening,
|
||||
"healthy": state.listening and not state.error,
|
||||
"listening": state.listening,
|
||||
"recording": state.recording,
|
||||
"processing": state.processing,
|
||||
"wake_count": state.wake_count,
|
||||
"porcupine_loaded": state.porcupine is not None,
|
||||
"eartail_url": EARTAIL_URL
|
||||
"error": state.error
|
||||
}
|
||||
|
||||
|
||||
@@ -415,89 +377,13 @@ async def status():
|
||||
"processing": state.processing,
|
||||
"last_transcription": state.last_transcription,
|
||||
"last_wake_time": state.last_wake_time,
|
||||
"wake_count": state.wake_count
|
||||
"wake_count": state.wake_count,
|
||||
"error": state.error
|
||||
}
|
||||
|
||||
|
||||
@app.post("/record")
|
||||
async def record(request: RecordRequest):
|
||||
"""Manually record for a specified duration."""
|
||||
if state.recording:
|
||||
raise HTTPException(status_code=409, detail="Already recording")
|
||||
|
||||
state.recording = True
|
||||
leds_listening()
|
||||
|
||||
try:
|
||||
# Simple timed recording (not VAD-based)
|
||||
p = pyaudio.PyAudio()
|
||||
frames = []
|
||||
|
||||
stream = p.open(
|
||||
format=pyaudio.paInt16,
|
||||
channels=1,
|
||||
rate=SAMPLE_RATE,
|
||||
input=True,
|
||||
frames_per_buffer=1024
|
||||
)
|
||||
|
||||
for _ in range(int(SAMPLE_RATE / 1024 * request.duration_sec)):
|
||||
data = stream.read(1024)
|
||||
frames.append(data)
|
||||
|
||||
stream.stop_stream()
|
||||
stream.close()
|
||||
p.terminate()
|
||||
|
||||
# Convert to WAV
|
||||
wav_buffer = io.BytesIO()
|
||||
with wave.open(wav_buffer, 'wb') as wf:
|
||||
wf.setnchannels(1)
|
||||
wf.setsampwidth(2)
|
||||
wf.setframerate(SAMPLE_RATE)
|
||||
wf.writeframes(b''.join(frames))
|
||||
|
||||
wav_buffer.seek(0)
|
||||
return {"success": True, "size_bytes": len(wav_buffer.getvalue())}
|
||||
|
||||
finally:
|
||||
state.recording = False
|
||||
leds_off()
|
||||
|
||||
|
||||
@app.post("/transcribe")
|
||||
async def transcribe_endpoint(request: RecordRequest):
|
||||
"""Record and transcribe."""
|
||||
if state.recording or state.processing:
|
||||
raise HTTPException(status_code=409, detail="Busy")
|
||||
|
||||
state.recording = True
|
||||
leds_listening()
|
||||
|
||||
try:
|
||||
start = time.time()
|
||||
audio_data = record_until_silence(timeout_sec=request.duration_sec)
|
||||
|
||||
leds_processing()
|
||||
state.recording = False
|
||||
state.processing = True
|
||||
|
||||
transcription = await transcribe_audio(audio_data)
|
||||
duration = time.time() - start
|
||||
|
||||
state.last_transcription = transcription
|
||||
|
||||
return TranscribeResponse(transcription=transcription, duration_sec=duration)
|
||||
|
||||
finally:
|
||||
state.recording = False
|
||||
state.processing = False
|
||||
leds_off()
|
||||
|
||||
|
||||
@app.get("/last")
|
||||
async def last_transcription():
|
||||
"""Get the last transcription."""
|
||||
async def last():
|
||||
return {
|
||||
"transcription": state.last_transcription,
|
||||
"wake_time": state.last_wake_time
|
||||
|
||||
Reference in New Issue
Block a user