def sound_classifier_loop():
    """Keep ``state.audio_scene`` updated from the shared audio ring buffer.

    Thread target. While ``state.running`` is true, each pass either waits
    0.1 s (ring buffer missing or holding fewer than 30 frames) or joins a
    snapshot of the buffered frames into one int16 sample array, hands it to
    ``sound_classifier.classify`` and stores the result in
    ``state.audio_scene``, then waits 0.5 s. Any exception raised during a
    classification pass is logged as a warning and the loop carries on.
    """
    logger.info("Sound classifier thread started")
    while state.running:
        buffer_ready = sound_ring_buffer is not None and len(sound_ring_buffer) >= 30
        if buffer_ready:
            try:
                # Snapshot the deque first: listener_loop keeps appending raw
                # frame bytes to it from its own thread.
                pcm_bytes = b"".join(list(sound_ring_buffer))
                samples = np.frombuffer(pcm_bytes, dtype=np.int16)
                state.audio_scene = sound_classifier.classify(samples)
            except Exception as exc:
                logger.warning("Sound classification error: %s", exc)
            pause = 0.5
        else:
            # Not enough audio buffered yet; poll again shortly.
            pause = 0.1
        time.sleep(pause)
    logger.info("Sound classifier thread stopped")
@app.get("/sounds")
async def sounds():
    """Return the latest audio scene classification.

    Raises HTTP 503 when sound classification is not enabled. Until the first
    classification result has been stored in ``state.audio_scene``, a
    placeholder payload with empty values is returned instead.
    """
    if state.sound_classification_enabled:
        if state.audio_scene is not None:
            return state.audio_scene
        # No result yet: answer with an empty scene.
        return {"category": None, "top_classes": [], "dominant_category": None, "timestamp": None}
    raise HTTPException(status_code=503, detail="Sound classification not available")


@app.get("/sounds/history")
async def sounds_history(seconds: int = 30):
    """Return classification history entries from the last ``seconds`` seconds.

    Raises HTTP 503 when sound classification is not enabled. The response is
    ``{"history": [...]}``; the list is empty when no classifier instance
    exists.
    """
    if not state.sound_classification_enabled:
        raise HTTPException(status_code=503, detail="Sound classification not available")
    entries = [] if sound_classifier is None else sound_classifier.get_history(seconds)
    return {"history": entries}
+36,/m/0lyf6,Breathing +37,/m/07mzm6,Wheeze +38,/m/01d3sd,Snoring +39,/m/07s0dtb,Gasp +40,/m/07pyy8b,Pant +41,/m/07q0yl5,Snort +42,/m/01b_21,Cough +43,/m/0dl9sf8,Throat clearing +44,/m/01hsr_,Sneeze +45,/m/07ppn3j,Sniff +46,/m/06h7j,Run +47,/m/07qv_x_,Shuffle +48,/m/07pbtc8,"Walk, footsteps" +49,/m/03cczk,"Chewing, mastication" +50,/m/07pdhp0,Biting +51,/m/0939n_,Gargling +52,/m/01g90h,Stomach rumble +53,/m/03q5_w,"Burping, eructation" +54,/m/02p3nc,Hiccup +55,/m/02_nn,Fart +56,/m/0k65p,Hands +57,/m/025_jnm,Finger snapping +58,/m/0l15bq,Clapping +59,/m/01jg02,"Heart sounds, heartbeat" +60,/m/01jg1z,Heart murmur +61,/m/053hz1,Cheering +62,/m/028ght,Applause +63,/m/07rkbfh,Chatter +64,/m/03qtwd,Crowd +65,/m/07qfr4h,"Hubbub, speech noise, speech babble" +66,/t/dd00013,Children playing +67,/m/0jbk,Animal +68,/m/068hy,"Domestic animals, pets" +69,/m/0bt9lr,Dog +70,/m/05tny_,Bark +71,/m/07r_k2n,Yip +72,/m/07qf0zm,Howl +73,/m/07rc7d9,Bow-wow +74,/m/0ghcn6,Growling +75,/t/dd00136,Whimper (dog) +76,/m/01yrx,Cat +77,/m/02yds9,Purr +78,/m/07qrkrw,Meow +79,/m/07rjwbb,Hiss +80,/m/07r81j2,Caterwaul +81,/m/0ch8v,"Livestock, farm animals, working animals" +82,/m/03k3r,Horse +83,/m/07rv9rh,Clip-clop +84,/m/07q5rw0,"Neigh, whinny" +85,/m/01xq0k1,"Cattle, bovinae" +86,/m/07rpkh9,Moo +87,/m/0239kh,Cowbell +88,/m/068zj,Pig +89,/t/dd00018,Oink +90,/m/03fwl,Goat +91,/m/07q0h5t,Bleat +92,/m/07bgp,Sheep +93,/m/025rv6n,Fowl +94,/m/09b5t,"Chicken, rooster" +95,/m/07st89h,Cluck +96,/m/07qn5dc,"Crowing, cock-a-doodle-doo" +97,/m/01rd7k,Turkey +98,/m/07svc2k,Gobble +99,/m/09ddx,Duck +100,/m/07qdb04,Quack +101,/m/0dbvp,Goose +102,/m/07qwf61,Honk +103,/m/01280g,Wild animals +104,/m/0cdnk,"Roaring cats (lions, tigers)" +105,/m/04cvmfc,Roar +106,/m/015p6,Bird +107,/m/020bb7,"Bird vocalization, bird call, bird song" +108,/m/07pggtn,"Chirp, tweet" +109,/m/07sx8x_,Squawk +110,/m/0h0rv,"Pigeon, dove" +111,/m/07r_25d,Coo +112,/m/04s8yn,Crow +113,/m/07r5c2p,Caw +114,/m/09d5_,Owl +115,/m/07r_80w,Hoot 
+116,/m/05_wcq,"Bird flight, flapping wings" +117,/m/01z5f,"Canidae, dogs, wolves" +118,/m/06hps,"Rodents, rats, mice" +119,/m/04rmv,Mouse +120,/m/07r4gkf,Patter +121,/m/03vt0,Insect +122,/m/09xqv,Cricket +123,/m/09f96,Mosquito +124,/m/0h2mp,"Fly, housefly" +125,/m/07pjwq1,Buzz +126,/m/01h3n,"Bee, wasp, etc." +127,/m/09ld4,Frog +128,/m/07st88b,Croak +129,/m/078jl,Snake +130,/m/07qn4z3,Rattle +131,/m/032n05,Whale vocalization +132,/m/04rlf,Music +133,/m/04szw,Musical instrument +134,/m/0fx80y,Plucked string instrument +135,/m/0342h,Guitar +136,/m/02sgy,Electric guitar +137,/m/018vs,Bass guitar +138,/m/042v_gx,Acoustic guitar +139,/m/06w87,"Steel guitar, slide guitar" +140,/m/01glhc,Tapping (guitar technique) +141,/m/07s0s5r,Strum +142,/m/018j2,Banjo +143,/m/0jtg0,Sitar +144,/m/04rzd,Mandolin +145,/m/01bns_,Zither +146,/m/07xzm,Ukulele +147,/m/05148p4,Keyboard (musical) +148,/m/05r5c,Piano +149,/m/01s0ps,Electric piano +150,/m/013y1f,Organ +151,/m/03xq_f,Electronic organ +152,/m/03gvt,Hammond organ +153,/m/0l14qv,Synthesizer +154,/m/01v1d8,Sampler +155,/m/03q5t,Harpsichord +156,/m/0l14md,Percussion +157,/m/02hnl,Drum kit +158,/m/0cfdd,Drum machine +159,/m/026t6,Drum +160,/m/06rvn,Snare drum +161,/m/03t3fj,Rimshot +162,/m/02k_mr,Drum roll +163,/m/0bm02,Bass drum +164,/m/011k_j,Timpani +165,/m/01p970,Tabla +166,/m/01qbl,Cymbal +167,/m/03qtq,Hi-hat +168,/m/01sm1g,Wood block +169,/m/07brj,Tambourine +170,/m/05r5wn,Rattle (instrument) +171,/m/0xzly,Maraca +172,/m/0mbct,Gong +173,/m/016622,Tubular bells +174,/m/0j45pbj,Mallet percussion +175,/m/0dwsp,"Marimba, xylophone" +176,/m/0dwtp,Glockenspiel +177,/m/0dwt5,Vibraphone +178,/m/0l156b,Steelpan +179,/m/05pd6,Orchestra +180,/m/01kcd,Brass instrument +181,/m/0319l,French horn +182,/m/07gql,Trumpet +183,/m/07c6l,Trombone +184,/m/0l14_3,Bowed string instrument +185,/m/02qmj0d,String section +186,/m/07y_7,"Violin, fiddle" +187,/m/0d8_n,Pizzicato +188,/m/01xqw,Cello +189,/m/02fsn,Double bass +190,/m/085jw,"Wind instrument, 
woodwind instrument" +191,/m/0l14j_,Flute +192,/m/06ncr,Saxophone +193,/m/01wy6,Clarinet +194,/m/03m5k,Harp +195,/m/0395lw,Bell +196,/m/03w41f,Church bell +197,/m/027m70_,Jingle bell +198,/m/0gy1t2s,Bicycle bell +199,/m/07n_g,Tuning fork +200,/m/0f8s22,Chime +201,/m/026fgl,Wind chime +202,/m/0150b9,Change ringing (campanology) +203,/m/03qjg,Harmonica +204,/m/0mkg,Accordion +205,/m/0192l,Bagpipes +206,/m/02bxd,Didgeridoo +207,/m/0l14l2,Shofar +208,/m/07kc_,Theremin +209,/m/0l14t7,Singing bowl +210,/m/01hgjl,Scratching (performance technique) +211,/m/064t9,Pop music +212,/m/0glt670,Hip hop music +213,/m/02cz_7,Beatboxing +214,/m/06by7,Rock music +215,/m/03lty,Heavy metal +216,/m/05r6t,Punk rock +217,/m/0dls3,Grunge +218,/m/0dl5d,Progressive rock +219,/m/07sbbz2,Rock and roll +220,/m/05w3f,Psychedelic rock +221,/m/06j6l,Rhythm and blues +222,/m/0gywn,Soul music +223,/m/06cqb,Reggae +224,/m/01lyv,Country +225,/m/015y_n,Swing music +226,/m/0gg8l,Bluegrass +227,/m/02x8m,Funk +228,/m/02w4v,Folk music +229,/m/06j64v,Middle Eastern music +230,/m/03_d0,Jazz +231,/m/026z9,Disco +232,/m/0ggq0m,Classical music +233,/m/05lls,Opera +234,/m/02lkt,Electronic music +235,/m/03mb9,House music +236,/m/07gxw,Techno +237,/m/07s72n,Dubstep +238,/m/0283d,Drum and bass +239,/m/0m0jc,Electronica +240,/m/08cyft,Electronic dance music +241,/m/0fd3y,Ambient music +242,/m/07lnk,Trance music +243,/m/0g293,Music of Latin America +244,/m/0ln16,Salsa music +245,/m/0326g,Flamenco +246,/m/0155w,Blues +247,/m/05fw6t,Music for children +248,/m/02v2lh,New-age music +249,/m/0y4f8,Vocal music +250,/m/0z9c,A capella +251,/m/0164x2,Music of Africa +252,/m/0145m,Afrobeat +253,/m/02mscn,Christian music +254,/m/016cjb,Gospel music +255,/m/028sqc,Music of Asia +256,/m/015vgc,Carnatic music +257,/m/0dq0md,Music of Bollywood +258,/m/06rqw,Ska +259,/m/02p0sh1,Traditional music +260,/m/05rwpb,Independent music +261,/m/074ft,Song +262,/m/025td0t,Background music +263,/m/02cjck,Theme music +264,/m/03r5q_,Jingle 
(music) +265,/m/0l14gg,Soundtrack music +266,/m/07pkxdp,Lullaby +267,/m/01z7dr,Video game music +268,/m/0140xf,Christmas music +269,/m/0ggx5q,Dance music +270,/m/04wptg,Wedding music +271,/t/dd00031,Happy music +272,/t/dd00033,Sad music +273,/t/dd00034,Tender music +274,/t/dd00035,Exciting music +275,/t/dd00036,Angry music +276,/t/dd00037,Scary music +277,/m/03m9d0z,Wind +278,/m/09t49,Rustling leaves +279,/t/dd00092,Wind noise (microphone) +280,/m/0jb2l,Thunderstorm +281,/m/0ngt1,Thunder +282,/m/0838f,Water +283,/m/06mb1,Rain +284,/m/07r10fb,Raindrop +285,/t/dd00038,Rain on surface +286,/m/0j6m2,Stream +287,/m/0j2kx,Waterfall +288,/m/05kq4,Ocean +289,/m/034srq,"Waves, surf" +290,/m/06wzb,Steam +291,/m/07swgks,Gurgling +292,/m/02_41,Fire +293,/m/07pzfmf,Crackle +294,/m/07yv9,Vehicle +295,/m/019jd,"Boat, Water vehicle" +296,/m/0hsrw,"Sailboat, sailing ship" +297,/m/056ks2,"Rowboat, canoe, kayak" +298,/m/02rlv9,"Motorboat, speedboat" +299,/m/06q74,Ship +300,/m/012f08,Motor vehicle (road) +301,/m/0k4j,Car +302,/m/0912c9,"Vehicle horn, car horn, honking" +303,/m/07qv_d5,Toot +304,/m/02mfyn,Car alarm +305,/m/04gxbd,"Power windows, electric windows" +306,/m/07rknqz,Skidding +307,/m/0h9mv,Tire squeal +308,/t/dd00134,Car passing by +309,/m/0ltv,"Race car, auto racing" +310,/m/07r04,Truck +311,/m/0gvgw0,Air brake +312,/m/05x_td,"Air horn, truck horn" +313,/m/02rhddq,Reversing beeps +314,/m/03cl9h,"Ice cream truck, ice cream van" +315,/m/01bjv,Bus +316,/m/03j1ly,Emergency vehicle +317,/m/04qvtq,Police car (siren) +318,/m/012n7d,Ambulance (siren) +319,/m/012ndj,"Fire engine, fire truck (siren)" +320,/m/04_sv,Motorcycle +321,/m/0btp2,"Traffic noise, roadway noise" +322,/m/06d_3,Rail transport +323,/m/07jdr,Train +324,/m/04zmvq,Train whistle +325,/m/0284vy3,Train horn +326,/m/01g50p,"Railroad car, train wagon" +327,/t/dd00048,Train wheels squealing +328,/m/0195fx,"Subway, metro, underground" +329,/m/0k5j,Aircraft +330,/m/014yck,Aircraft engine +331,/m/04229,Jet engine 
+332,/m/02l6bg,"Propeller, airscrew" +333,/m/09ct_,Helicopter +334,/m/0cmf2,"Fixed-wing aircraft, airplane" +335,/m/0199g,Bicycle +336,/m/06_fw,Skateboard +337,/m/02mk9,Engine +338,/t/dd00065,Light engine (high frequency) +339,/m/08j51y,"Dental drill, dentist's drill" +340,/m/01yg9g,Lawn mower +341,/m/01j4z9,Chainsaw +342,/t/dd00066,Medium engine (mid frequency) +343,/t/dd00067,Heavy engine (low frequency) +344,/m/01h82_,Engine knocking +345,/t/dd00130,Engine starting +346,/m/07pb8fc,Idling +347,/m/07q2z82,"Accelerating, revving, vroom" +348,/m/02dgv,Door +349,/m/03wwcy,Doorbell +350,/m/07r67yg,Ding-dong +351,/m/02y_763,Sliding door +352,/m/07rjzl8,Slam +353,/m/07r4wb8,Knock +354,/m/07qcpgn,Tap +355,/m/07q6cd_,Squeak +356,/m/0642b4,Cupboard open or close +357,/m/0fqfqc,Drawer open or close +358,/m/04brg2,"Dishes, pots, and pans" +359,/m/023pjk,"Cutlery, silverware" +360,/m/07pn_8q,Chopping (food) +361,/m/0dxrf,Frying (food) +362,/m/0fx9l,Microwave oven +363,/m/02pjr4,Blender +364,/m/02jz0l,"Water tap, faucet" +365,/m/0130jx,Sink (filling or washing) +366,/m/03dnzn,Bathtub (filling or washing) +367,/m/03wvsk,Hair dryer +368,/m/01jt3m,Toilet flush +369,/m/012xff,Toothbrush +370,/m/04fgwm,Electric toothbrush +371,/m/0d31p,Vacuum cleaner +372,/m/01s0vc,Zipper (clothing) +373,/m/03v3yw,Keys jangling +374,/m/0242l,Coin (dropping) +375,/m/01lsmm,Scissors +376,/m/02g901,"Electric shaver, electric razor" +377,/m/05rj2,Shuffling cards +378,/m/0316dw,Typing +379,/m/0c2wf,Typewriter +380,/m/01m2v,Computer keyboard +381,/m/081rb,Writing +382,/m/07pp_mv,Alarm +383,/m/07cx4,Telephone +384,/m/07pp8cl,Telephone bell ringing +385,/m/01hnzm,Ringtone +386,/m/02c8p,"Telephone dialing, DTMF" +387,/m/015jpf,Dial tone +388,/m/01z47d,Busy signal +389,/m/046dlr,Alarm clock +390,/m/03kmc9,Siren +391,/m/0dgbq,Civil defense siren +392,/m/030rvx,Buzzer +393,/m/01y3hg,"Smoke detector, smoke alarm" +394,/m/0c3f7m,Fire alarm +395,/m/04fq5q,Foghorn +396,/m/0l156k,Whistle +397,/m/06hck5,Steam 
whistle +398,/t/dd00077,Mechanisms +399,/m/02bm9n,"Ratchet, pawl" +400,/m/01x3z,Clock +401,/m/07qjznt,Tick +402,/m/07qjznl,Tick-tock +403,/m/0l7xg,Gears +404,/m/05zc1,Pulleys +405,/m/0llzx,Sewing machine +406,/m/02x984l,Mechanical fan +407,/m/025wky1,Air conditioning +408,/m/024dl,Cash register +409,/m/01m4t,Printer +410,/m/0dv5r,Camera +411,/m/07bjf,Single-lens reflex camera +412,/m/07k1x,Tools +413,/m/03l9g,Hammer +414,/m/03p19w,Jackhammer +415,/m/01b82r,Sawing +416,/m/02p01q,Filing (rasp) +417,/m/023vsd,Sanding +418,/m/0_ksk,Power tool +419,/m/01d380,Drill +420,/m/014zdl,Explosion +421,/m/032s66,"Gunshot, gunfire" +422,/m/04zjc,Machine gun +423,/m/02z32qm,Fusillade +424,/m/0_1c,Artillery fire +425,/m/073cg4,Cap gun +426,/m/0g6b5,Fireworks +427,/g/122z_qxw,Firecracker +428,/m/07qsvvw,"Burst, pop" +429,/m/07pxg6y,Eruption +430,/m/07qqyl4,Boom +431,/m/083vt,Wood +432,/m/07pczhz,Chop +433,/m/07pl1bw,Splinter +434,/m/07qs1cx,Crack +435,/m/039jq,Glass +436,/m/07q7njn,"Chink, clink" +437,/m/07rn7sz,Shatter +438,/m/04k94,Liquid +439,/m/07rrlb6,"Splash, splatter" +440,/m/07p6mqd,Slosh +441,/m/07qlwh6,Squish +442,/m/07r5v4s,Drip +443,/m/07prgkl,Pour +444,/m/07pqc89,"Trickle, dribble" +445,/t/dd00088,Gush +446,/m/07p7b8y,Fill (with liquid) +447,/m/07qlf79,Spray +448,/m/07ptzwd,Pump (liquid) +449,/m/07ptfmf,Stir +450,/m/0dv3j,Boiling +451,/m/0790c,Sonar +452,/m/0dl83,Arrow +453,/m/07rqsjt,"Whoosh, swoosh, swish" +454,/m/07qnq_y,"Thump, thud" +455,/m/07rrh0c,Thunk +456,/m/0b_fwt,Electronic tuner +457,/m/02rr_,Effects unit +458,/m/07m2kt,Chorus effect +459,/m/018w8,Basketball bounce +460,/m/07pws3f,Bang +461,/m/07ryjzk,"Slap, smack" +462,/m/07rdhzs,"Whack, thwack" +463,/m/07pjjrj,"Smash, crash" +464,/m/07pc8lb,Breaking +465,/m/07pqn27,Bouncing +466,/m/07rbp7_,Whip +467,/m/07pyf11,Flap +468,/m/07qb_dv,Scratch +469,/m/07qv4k0,Scrape +470,/m/07pdjhy,Rub +471,/m/07s8j8t,Roll +472,/m/07plct2,Crushing +473,/t/dd00112,"Crumpling, crinkling" +474,/m/07qcx4z,Tearing 
# Category mapping: group YAMNet's 521 classes into useful buckets.
#
# Every entry must match a ``display_name`` in yamnet_class_map.csv exactly,
# and must appear in only ONE bucket: SoundClassifier builds a reverse lookup
# (class name -> category) from this table, so a name listed twice silently
# ends up in whichever bucket comes last.
CATEGORY_GROUPS = {
    "speech": [
        "Speech", "Child speech, kid speaking", "Conversation",
        "Narration, monologue", "Babbling", "Speech synthesizer",
        "Shout", "Yell", "Whispering", "Laughter", "Chatter",
    ],
    "alert": [
        "Doorbell", "Ding-dong", "Knock", "Tap", "Alarm",
        "Alarm clock", "Smoke detector, smoke alarm", "Fire alarm",
        "Siren", "Buzzer", "Telephone", "Telephone bell ringing",
        "Ringtone", "Telephone dialing, DTMF", "Bell", "Church bell",
    ],
    "music": [
        "Music", "Musical instrument", "Singing", "Song", "Guitar",
        "Electric guitar", "Acoustic guitar", "Bass guitar", "Piano",
        "Keyboard (musical)", "Drum", "Drum kit", "Snare drum",
        "Bass drum", "Cymbal", "Hi-hat", "Violin, fiddle", "Flute",
        "Trumpet", "Saxophone", "Harmonica", "Organ", "Synthesizer",
        "Plucked string instrument", "Strum", "Hip hop music",
        "Pop music", "Rock music", "Heavy metal", "Punk rock",
        "Grunge", "Progressive rock", "Rock and roll", "Jazz",
        "Blues", "Soul music", "Reggae", "Country", "Swing music",
        "Bluegrass", "Funk", "Folk music", "Middle Eastern music",
        "Electronic music", "House music", "Techno", "Dubstep",
        "Drum and bass", "Electronica", "Electronic dance music",
        "Ambient music", "Trance music", "Music of Latin America",
        "Salsa music", "Flamenco", "Gospel music", "Christian music",
        "Music of Africa", "Afrobeat", "Music of Asia",
    ],
    "animal": [
        # "Whimper (dog)" is the canine class; plain "Whimper" is the human
        # vocalization and is intentionally not listed here.
        "Dog", "Bark", "Howl", "Bow-wow", "Growling", "Whimper (dog)",
        "Cat", "Purr", "Meow", "Hiss", "Caterwaul",
        "Bird", "Bird vocalization, bird call, bird song", "Chirp, tweet",
        "Squawk", "Crow", "Caw", "Owl", "Pigeon, dove",
        "Insect", "Cricket", "Mosquito", "Fly, housefly", "Bee, wasp, etc.",
        # The class map has a single "Chicken, rooster" entry.
        "Frog", "Chicken, rooster", "Duck", "Goose",
    ],
    "household": [
        # "Doorbell" lives in "alert" only (see note above the table).
        "Door", "Sliding door", "Slam",
        "Drawer open or close", "Cupboard open or close",
        "Cutlery, silverware", "Dishes, pots, and pans",
        "Glass", "Chink, clink", "Water tap, faucet",
        "Sink (filling or washing)", "Bathtub (filling or washing)",
        "Toilet flush", "Vacuum cleaner", "Blender",
        "Microwave oven", "Typing", "Computer keyboard",
        "Writing", "Keys jangling", "Coin (dropping)",
        "Walk, footsteps", "Run",
        "Clapping", "Finger snapping",
    ],
    "environment": [
        "Wind", "Rustling leaves", "Wind noise (microphone)",
        "Rain", "Raindrop", "Rain on surface", "Thunder",
        "Thunderstorm", "Stream", "Waterfall", "Ocean",
        "Waves, surf", "Traffic noise, roadway noise",
        "Car", "Engine", "Idling", "Truck", "Bus", "Motorcycle",
        "Aircraft", "Aircraft engine", "Helicopter",
        "Train", "Railroad car, train wagon",
    ],
    "silence": [
        "Silence", "White noise", "Static",
    ],
}
class SoundClassifier:
    """YAMNet audio event classifier running on a TFLite interpreter.

    Each ``classify`` call scores one window of audio, maps the best-scoring
    class to a coarse category via ``CATEGORY_GROUPS`` and records it in a
    bounded history (last 20 results) used to report a dominant category.
    """

    def __init__(self, model_path, class_map_path, use_edgetpu=False):
        """Load the class map and the TFLite model.

        Args:
            model_path: path to the YAMNet ``.tflite`` model file.
            class_map_path: path to the class map CSV (header row, then one
                row per class with the display name in the third column).
            use_edgetpu: when true, load the model through the Edge TPU
                delegate (``libedgetpu.so.1``) instead of running on CPU.
        """
        self._class_names = self._load_class_names(class_map_path)
        logger.info("Loaded %d YAMNet class names", len(self._class_names))

        # Reverse lookup: class_name -> category. If a name is listed under
        # several categories, the last category in CATEGORY_GROUPS wins.
        self._class_to_category = {
            name: category
            for category, names in CATEGORY_GROUPS.items()
            for name in names
        }

        # Imported here rather than at module level, so importing this module
        # does not itself require the TFLite runtime.
        import ai_edge_litert.interpreter as tfl

        if use_edgetpu:
            delegate = tfl.load_delegate("libedgetpu.so.1")
            self._interp = tfl.Interpreter(
                model_path=str(model_path),
                experimental_delegates=[delegate],
            )
            logger.info("YAMNet loaded on Edge TPU")
        else:
            self._interp = tfl.Interpreter(model_path=str(model_path))
            logger.info("YAMNet loaded on CPU")

        self._interp.allocate_tensors()
        self._input = self._interp.get_input_details()[0]
        self._output = self._interp.get_output_details()[0]
        logger.info(
            "YAMNet ready: input %s %s, output %s",
            self._input["shape"], self._input["dtype"], self._output["shape"],
        )

        # Number of samples fed to the model per classification.
        self._input_samples = YAMNET_INPUT_SAMPLES

        # Classification history for smoothing
        self._history = collections.deque(maxlen=20)

    @staticmethod
    def _load_class_names(class_map_path):
        """Read display names (third CSV column) in file order.

        Blank rows are skipped; an empty file yields an empty list. A
        non-blank row with fewer than three columns raises ``IndexError``.
        """
        # newline="" is what the csv module expects for files it parses;
        # the explicit encoding avoids depending on the platform default.
        with open(class_map_path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            return [row[2] for row in reader if row]

    def _prepare_input(self, audio_int16):
        """Convert int16 PCM to float32 in [-1.0, 1.0] of exact model length.

        Shorter input is zero-padded at the end; longer input keeps only the
        most recent (trailing) samples.
        """
        audio_f32 = np.asarray(audio_int16).astype(np.float32) / 32768.0
        wanted = self._input_samples
        if len(audio_f32) < wanted:
            audio_f32 = np.pad(audio_f32, (0, wanted - len(audio_f32)))
        elif len(audio_f32) > wanted:
            audio_f32 = audio_f32[-wanted:]
        return audio_f32

    def _summarize(self, scores):
        """Turn a per-class score vector into the result dict and log it.

        Appends the resulting category to the history and derives the
        dominant category from the history including this result.
        """
        # Top 5 classes, best first
        top_indices = np.argsort(scores)[-5:][::-1]
        top_classes = []
        for idx in top_indices:
            # Guard against a model emitting more classes than the map names.
            name = self._class_names[idx] if idx < len(self._class_names) else "Unknown"
            top_classes.append({"name": name, "score": round(float(scores[idx]), 3)})

        # Map top class to category
        category = self._class_to_category.get(top_classes[0]["name"], "other")

        now = time.time()
        self._history.append({"category": category, "timestamp": now})

        # The history is never empty here (we just appended), so the most
        # common category always exists; ties go to the earliest entry.
        cat_counts = collections.Counter(h["category"] for h in self._history)
        dominant = cat_counts.most_common(1)[0][0]

        return {
            "category": category,
            "top_classes": top_classes,
            "dominant_category": dominant,
            "timestamp": now,
        }

    def classify(self, audio_int16):
        """Classify an audio buffer.

        Args:
            audio_int16: numpy int16 array of PCM samples at 16kHz

        Returns:
            dict with category, top_classes, dominant_category, timestamp
        """
        audio_f32 = self._prepare_input(audio_int16)

        self._interp.set_tensor(self._input["index"], audio_f32)
        self._interp.invoke()

        # First row of the output tensor holds the per-class scores.
        scores = self._interp.get_tensor(self._output["index"])[0]
        return self._summarize(scores)

    def get_history(self, seconds=30):
        """Return history entries recorded within the last ``seconds`` seconds.

        Each entry is a dict with ``category`` and ``timestamp``. At most the
        last 20 classifications are retained, however large ``seconds`` is.
        """
        cutoff = time.time() - seconds
        # Copy before filtering: classify() may append from another thread,
        # and iterating a deque while it is mutated raises RuntimeError.
        snapshot = list(self._history)
        return [h for h in snapshot if h["timestamp"] >= cutoff]