Add core service infrastructure

- NATS event bus (pub/sub, JetStream, KV storage)
- Service registry with health monitoring
- Base service class with lifecycle management
- Config system
- Logger with Vi formatting

Adapted from Lyra's patterns, namespace changed to vi.*

🦊💕
This commit is contained in:
Alex Kazaiev
2026-01-02 13:04:26 -06:00
parent c891740d7c
commit e2d24a66f1
8 changed files with 1321 additions and 0 deletions

31
core/__init__.py Normal file
View File

@@ -0,0 +1,31 @@
# Vi Core Module
# Nervous system infrastructure for Vivienne Rousseau
from .config import config
from .logger import setup_logger, logger
from .nats_event_bus import NatsEventBus, nats_bus
from .service_registry import (
ServiceRegistry,
ServiceManifest,
ServiceOperation,
ServiceInstance,
ServiceStatus,
service_registry
)
from .base_service import BaseService, SimpleService
__all__ = [
'config',
'setup_logger',
'logger',
'NatsEventBus',
'nats_bus',
'ServiceRegistry',
'ServiceManifest',
'ServiceOperation',
'ServiceInstance',
'ServiceStatus',
'service_registry',
'BaseService',
'SimpleService',
]

397
core/base_service.py Normal file
View File

@@ -0,0 +1,397 @@
"""
Base Service Class for Vi
Provides standardized service lifecycle management, registration, and health monitoring.
All Vi services should inherit from this base class.
"""
import asyncio
import json
import time
import uuid
from abc import ABC, abstractmethod
from typing import Dict, Any, List, Optional
from datetime import datetime
from .logger import setup_logger
from .service_registry import ServiceManifest, ServiceOperation, ServiceStatus, service_registry
logger = setup_logger('base_service')
class BaseService(ABC):
"""Base class for all Vi services providing standardized lifecycle management"""
def __init__(self, service_id: str, event_bus=None):
self.service_id = service_id
self.event_bus = event_bus
self.instance_id = f"{service_id}-{uuid.uuid4().hex[:8]}"
self._running = False
self._heartbeat_task = None
self._health_check_task = None
self.heartbeat_interval = 60
self.health_check_interval = 10
self._health_data = {}
self._status = ServiceStatus.UNKNOWN
self._heartbeat_failures = 0
self._max_heartbeat_failures = 3
self._registration_confirmed = False
self._first_heartbeat = True
self.logger = setup_logger(service_id, service_name=service_id)
@abstractmethod
def get_service_manifest(self) -> ServiceManifest:
"""Return service manifest with operations and metadata"""
pass
@abstractmethod
async def initialize_service(self):
"""Initialize service-specific resources"""
pass
@abstractmethod
async def cleanup_service(self):
"""Cleanup service-specific resources"""
pass
@abstractmethod
async def perform_health_check(self) -> Dict[str, Any]:
"""Perform service-specific health check"""
pass
async def start(self, event_bus=None):
"""Start the service with full lifecycle management"""
if event_bus:
self.event_bus = event_bus
if not self.event_bus:
raise ValueError("Event bus is required")
try:
self.logger.info(f"[🚀] Starting service: {self.service_id}")
if self.service_id == 'health':
await service_registry.initialize(self.event_bus)
manifest = self.get_service_manifest()
if self.service_id == 'health':
service_registry.register_service(self.service_id, manifest, self.instance_id)
self._registration_confirmed = True
else:
await self._send_registration_message(manifest)
await self.initialize_service()
self._running = True
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
self._health_check_task = asyncio.create_task(self._health_check_loop())
self._status = ServiceStatus.HEALTHY
await self._send_heartbeat()
self.logger.info(f"[✅] Service started: {self.service_id}")
except Exception as e:
self.logger.exception(f"[❌] Failed to start {self.service_id}: {e}")
self._status = ServiceStatus.UNHEALTHY
raise
async def stop(self):
"""Stop the service gracefully"""
self.logger.info(f"[🛑] Stopping service: {self.service_id}")
self._running = False
self._status = ServiceStatus.OFFLINE
if self._heartbeat_task:
self._heartbeat_task.cancel()
try:
await self._heartbeat_task
except asyncio.CancelledError:
pass
if self._health_check_task:
self._health_check_task.cancel()
try:
await self._health_check_task
except asyncio.CancelledError:
pass
try:
await self.cleanup_service()
await self._send_heartbeat()
if self.service_id == 'health':
service_registry.deregister_service(self.service_id)
else:
await self._send_deregistration_message()
self.logger.info(f"[✅] Service stopped: {self.service_id}")
except Exception as e:
self.logger.exception(f"[❌] Error during shutdown: {e}")
async def _heartbeat_loop(self):
"""Background task to send periodic heartbeats"""
while self._running:
try:
await self._send_heartbeat()
await asyncio.sleep(self.heartbeat_interval)
except asyncio.CancelledError:
break
except Exception as e:
self.logger.exception(f"[💔] Heartbeat error: {e}")
await asyncio.sleep(5)
async def _health_check_loop(self):
"""Background task for periodic health checks"""
while self._running:
try:
health_data = await self.perform_health_check()
self._health_data = health_data
if health_data.get('healthy', True):
if self._status == ServiceStatus.UNHEALTHY:
self._status = ServiceStatus.HEALTHY
self.logger.info(f"[💚] Service recovered")
else:
if self._status == ServiceStatus.HEALTHY:
self._status = ServiceStatus.UNHEALTHY
self.logger.warning(f"[💔] Service unhealthy")
await asyncio.sleep(self.health_check_interval)
except asyncio.CancelledError:
break
except Exception as e:
self.logger.exception(f"[💔] Health check error: {e}")
self._status = ServiceStatus.UNHEALTHY
await asyncio.sleep(10)
async def _send_heartbeat(self):
"""Send heartbeat to service registry"""
try:
health_data = {
'status': self._status.value,
'timestamp': datetime.utcnow().isoformat(),
'instance_id': self.instance_id,
**self._health_data
}
if self.service_id == 'health':
service_registry.update_service_heartbeat(self.service_id, health_data)
self._heartbeat_failures = 0
else:
if self.event_bus:
await self._send_resilient_heartbeat(health_data)
except Exception as e:
self.logger.exception(f"[💔] Failed to send heartbeat: {e}")
self._heartbeat_failures += 1
if self._heartbeat_failures >= self._max_heartbeat_failures:
await self._attempt_reregistration()
async def request_service(self, target_service: str, operation: str,
payload: Dict[str, Any], timeout: float = 5.0) -> Dict[str, Any]:
"""Make a request to another service"""
topic = f"vi.services.{target_service}.{operation}"
try:
request_data = json.dumps(payload).encode()
response_msg = await self.event_bus.client.request(topic, request_data, timeout=timeout)
return json.loads(response_msg.data.decode())
except Exception as e:
self.logger.exception(f"[🔗] Request failed {target_service}.{operation}: {e}")
raise
async def emit_event(self, event_type: str, payload: Dict[str, Any]):
"""Emit an event using standardized topic naming"""
if not self.event_bus:
raise ValueError("Event bus not available")
await self.event_bus.emit(event_type, payload)
def register_handler(self, operation: str, handler):
"""Register a request-reply handler for a service operation"""
if not self.event_bus:
raise ValueError("Event bus not available")
async def wrapped_handler(msg):
try:
result = await handler(msg)
if result is not None:
await msg.respond(json.dumps(result).encode())
except Exception as e:
error_response = {"error": str(e), "status": "error"}
await msg.respond(json.dumps(error_response).encode())
self.logger.error(f"Handler error for {operation}: {e}")
topic = f"vi.services.{self.service_id}.{operation}"
return self.event_bus.on(topic, wrapped_handler)
def create_service_operation(self, operation_id: str, description: str,
request_topic: Optional[str] = None,
response_pattern: str = "request-reply",
timeout_ms: int = 5000,
parameters: Optional[List[Dict[str, Any]]] = None,
metadata: Optional[Dict[str, Any]] = None) -> ServiceOperation:
"""Helper to create a ServiceOperation"""
if request_topic is None:
request_topic = f"vi.services.{self.service_id}.{operation_id}"
return ServiceOperation(
operation_id=operation_id,
description=description,
request_topic=request_topic,
response_pattern=response_pattern,
parameters=parameters or [],
timeout_ms=timeout_ms,
metadata=metadata or {}
)
def get_status(self) -> ServiceStatus:
return self._status
def set_status(self, status: ServiceStatus):
if self._status != status:
self.logger.info(f"[📊] Status: {self._status.value}{status.value}")
self._status = status
def update_health_data(self, health_data: Dict[str, Any]):
self._health_data.update(health_data)
def get_health_data(self) -> Dict[str, Any]:
return self._health_data.copy()
def get_service_info(self) -> Dict[str, Any]:
manifest = self.get_service_manifest()
return {
'service_id': self.service_id,
'instance_id': self.instance_id,
'status': self._status.value,
'manifest': manifest.__dict__,
'health_data': self._health_data,
'running': self._running
}
async def _send_registration_message(self, manifest: ServiceManifest):
"""Send registration message via NATS"""
if not self.event_bus:
return
registration_payload = {
'service_id': self.service_id,
'instance_id': self.instance_id,
'manifest': {
'service_id': manifest.service_id,
'name': manifest.name,
'description': manifest.description,
'version': manifest.version,
'operations': [op.__dict__ for op in manifest.operations],
'health_check_topic': manifest.health_check_topic,
'metadata': manifest.metadata
}
}
await self.event_bus.emit("vi.services.register", registration_payload)
self.logger.info(f"[🗂️] Registered: {self.service_id}")
async def _send_deregistration_message(self):
"""Send deregistration message via NATS"""
if not self.event_bus:
return
await self.event_bus.emit("vi.services.deregister", {
'service_id': self.service_id,
'instance_id': self.instance_id
})
self.logger.info(f"[🗂️] Deregistered: {self.service_id}")
async def _send_resilient_heartbeat(self, health_data: Dict[str, Any]):
"""Send heartbeat with acknowledgment"""
try:
heartbeat_payload = {
'service_id': self.service_id,
'instance_id': self.instance_id,
'health_data': health_data
}
request_data = json.dumps(heartbeat_payload).encode()
response_msg = await self.event_bus.client.request(
"vi.services.heartbeat",
request_data,
timeout=5.0
)
response = json.loads(response_msg.data.decode())
acknowledged = response.get('acknowledged', False)
if acknowledged:
self._heartbeat_failures = 0
if not self._registration_confirmed:
self._registration_confirmed = True
self.logger.info(f"[✅] Registration confirmed")
else:
self._registration_confirmed = False
await self._attempt_reregistration()
except Exception as e:
self._heartbeat_failures += 1
self.logger.warning(f"[💔] Heartbeat failed ({self._heartbeat_failures}): {e}")
if self._heartbeat_failures >= self._max_heartbeat_failures:
await self._attempt_reregistration()
async def _attempt_reregistration(self):
"""Attempt to re-register service"""
self.logger.warning(f"[🔄] Re-registering {self.service_id}")
try:
self._heartbeat_failures = 0
manifest = self.get_service_manifest()
await self._send_registration_message(manifest)
self.logger.info(f"[✅] Re-registered {self.service_id}")
except Exception as e:
self.logger.error(f"[❌] Re-registration failed: {e}")
class SimpleService(BaseService):
"""Simple implementation of BaseService"""
def __init__(self, service_id: str, name: str = "", description: str = "",
version: str = "1.0.0", operations: Optional[List[ServiceOperation]] = None,
event_bus=None):
super().__init__(service_id, event_bus)
self._name = name or service_id
self._description = description or f"Service: {service_id}"
self._version = version
self._operations = operations or []
def get_service_manifest(self) -> ServiceManifest:
return ServiceManifest(
service_id=self.service_id,
name=self._name,
description=self._description,
version=self._version,
operations=self._operations,
health_check_topic=f"vi.services.{self.service_id}.health"
)
async def initialize_service(self):
pass
async def cleanup_service(self):
pass
async def perform_health_check(self) -> Dict[str, Any]:
return {
'healthy': True,
'checks': {
'running': self._running,
'event_bus': self.event_bus is not None
}
}

94
core/config.py Normal file
View File

@@ -0,0 +1,94 @@
import json
import os
from pathlib import Path
from typing import Dict, Any
class Config:
_instance = None
_config_data = None
def __new__(cls):
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def __init__(self):
if self._config_data is None:
self._load_config()
def _load_config(self):
config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.json')
config_path = os.path.abspath(config_path)
try:
with open(config_path, 'r') as f:
self._config_data = json.load(f)
except FileNotFoundError:
# Default config if file doesn't exist
self._config_data = {
'nats': {
'url': 'nats://localhost:4222',
'connection_timeout': 30,
'max_reconnect_attempts': 10
}
}
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON in configuration file: {e}")
def get(self, key: str, default: Any = None) -> Any:
keys = key.split('.')
value = self._config_data
for k in keys:
if isinstance(value, dict) and k in value:
value = value[k]
else:
return default
return value
@property
def nats_url(self) -> str:
return os.getenv('NATS_URL') or self.get('nats.url', 'nats://localhost:4222')
@property
def nats_connection_timeout(self) -> int:
return self.get('nats.connection_timeout', 30)
@property
def nats_max_reconnect_attempts(self) -> int:
return self.get('nats.max_reconnect_attempts', 10)
def get_local_services(self) -> list:
"""Get list of services that should run on this node"""
systemd_services = self.get('auto_upgrade.systemd_services', [])
local_services = []
for systemd_service in systemd_services:
service_name = systemd_service.replace('vi-', '').replace('.service', '')
local_services.append(service_name)
return local_services
def is_service_enabled(self, service_name: str) -> bool:
"""Check if a service is enabled on this node"""
return service_name in self.get_local_services()
@property
def project_root(self) -> Path:
"""Get the project root directory"""
return Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@property
def data_dir(self) -> Path:
"""Get the data directory for databases"""
data_path = os.getenv('DATA_DIR', str(self.project_root / 'data'))
data_dir = Path(data_path)
data_dir.mkdir(exist_ok=True, parents=True)
return data_dir
config = Config()
# Database paths
SHORT_TERM_DB = config.data_dir / 'memory_active.db'
LONG_TERM_DB = config.data_dir / 'memory_archive.db'

73
core/logger.py Normal file
View File

@@ -0,0 +1,73 @@
import logging
import sys
import os
from datetime import datetime
from logging.handlers import TimedRotatingFileHandler
class ViFormatter(logging.Formatter):
"""Custom formatter for Vi services with symbolic level indicators"""
def format(self, record):
timestamp = datetime.now().strftime('%H:%M:%S')
level_map = {
'DEBUG': '·',
'INFO': '',
'WARNING': '',
'ERROR': '',
'CRITICAL': ''
}
level_symbol = level_map.get(record.levelname, '?')
return f"[{timestamp}] {level_symbol} {record.getMessage()}"
def _get_project_root():
"""Find the project root directory by looking for core/ directory"""
current_dir = os.path.dirname(os.path.abspath(__file__))
return os.path.dirname(current_dir)
def setup_logger(name="vi", level=logging.INFO, service_name=None):
"""Set up a logger with console and optional file output"""
# Allow environment variable to override log level
env_level = os.getenv('LOG_LEVEL', '').upper()
if env_level in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
level = getattr(logging, env_level)
logger = logging.getLogger(name)
if logger.handlers:
return logger
logger.setLevel(level)
# Console handler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(ViFormatter())
logger.addHandler(console_handler)
# File handler with daily rotation if service_name is specified
if service_name:
project_root = _get_project_root()
logs_dir = os.path.join(project_root, 'logs')
if not os.path.exists(logs_dir):
os.makedirs(logs_dir, exist_ok=True)
log_file = os.path.join(logs_dir, f"{service_name}.log")
file_handler = TimedRotatingFileHandler(
log_file,
when='midnight',
interval=1,
backupCount=30,
encoding='utf-8'
)
file_handler.setFormatter(ViFormatter())
logger.addHandler(file_handler)
logger.propagate = False
return logger
logger = setup_logger()

209
core/nats_event_bus.py Normal file
View File

@@ -0,0 +1,209 @@
"""
NATS Event Bus for Vi
Provides pub/sub messaging, JetStream durable queues, and KV storage.
"""
import json
from nats.aio.client import Client as NATS
from nats.errors import TimeoutError as NatsTimeoutError
from .config import config
from .logger import logger
class NatsEventBus:
def __init__(self):
self.nc = NATS()
self.subscriptions = {}
self._connected = False
self._js = None
async def connect(self, url=None):
if self._connected:
return
url = url or config.nats_url
logger.info(f"[NATS] Connecting to {url}")
try:
await self.nc.connect(
servers=[url],
connect_timeout=config.nats_connection_timeout,
max_reconnect_attempts=config.nats_max_reconnect_attempts,
)
self._connected = True
self._js = self.nc.jetstream()
logger.info("[NATS] Connected successfully")
except Exception as e:
logger.error(f"[NATS] Connection failed: {e}")
raise
async def emit(self, topic, payload):
if not self._connected:
logger.error("[NATS] Cannot emit - EventBus is not connected")
raise RuntimeError("EventBus is not connected")
try:
await self.nc.publish(topic, json.dumps(payload).encode())
logger.debug(f"[NATS] Emitted to {topic}")
except Exception as e:
logger.error(f"[NATS] Failed to emit to {topic}: {e}")
raise
async def on(self, topic, handler):
if not self._connected:
logger.error("[NATS] Cannot subscribe - EventBus is not connected")
raise RuntimeError("EventBus is not connected")
async def wrapper(msg):
try:
data = msg.data.decode()
logger.debug(f"[NATS] Received message on {topic}")
except Exception as e:
logger.error(f"[NATS] Failed to decode message on {topic}: {e}")
return
try:
await handler(msg)
except Exception as e:
logger.error(f"[NATS] Handler error for {topic}: {e}")
try:
sub = await self.nc.subscribe(topic, cb=wrapper)
self.subscriptions[topic] = sub
logger.info(f"[NATS] Subscribed to {topic}")
except Exception as e:
logger.error(f"[NATS] Failed to subscribe to {topic}: {e}")
raise
async def off(self, topic):
sub = self.subscriptions.pop(topic, None)
if sub:
try:
await sub.unsubscribe()
logger.info(f"[NATS] Unsubscribed from {topic}")
except Exception as e:
logger.error(f"[NATS] Failed to unsubscribe from {topic}: {e}")
async def jetstream_pull(self, stream_name, consumer_name, timeout=1.0):
"""Pull next message from JetStream consumer"""
if not self._connected:
raise RuntimeError("EventBus is not connected")
if not self._js:
raise RuntimeError("JetStream not initialized")
try:
sub = await self._js.pull_subscribe(
subject="",
durable=consumer_name,
stream=stream_name,
)
msgs = await sub.fetch(batch=1, timeout=timeout)
return msgs[0] if msgs else None
except NatsTimeoutError:
return None
except Exception as e:
logger.error(f"[NATS] JetStream pull error: {e}")
raise
async def kv_get_bucket(self, bucket_name: str, ttl_seconds: int = 1800):
"""Get or create a NATS KV bucket"""
if not self._connected:
raise RuntimeError("EventBus is not connected")
if not self._js:
raise RuntimeError("JetStream not initialized")
try:
kv = await self._js.key_value(bucket=bucket_name)
logger.debug(f"[NATS KV] Using existing bucket: {bucket_name}")
return kv
except Exception:
try:
kv = await self._js.create_key_value(
bucket=bucket_name,
ttl=ttl_seconds
)
logger.info(f"[NATS KV] Created bucket: {bucket_name} (TTL: {ttl_seconds}s)")
return kv
except Exception as e:
logger.error(f"[NATS KV] Failed to create bucket {bucket_name}: {e}")
raise
async def kv_put(self, bucket_name: str, key: str, value: bytes, ttl_seconds: int = 1800):
"""Put a value in a KV bucket"""
try:
kv = await self.kv_get_bucket(bucket_name, ttl_seconds)
await kv.put(key, value)
logger.debug(f"[NATS KV] Put: {bucket_name}/{key}")
except Exception as e:
logger.error(f"[NATS KV] Put error for {bucket_name}/{key}: {e}")
raise
async def kv_get(self, bucket_name: str, key: str):
"""Get a value from a KV bucket"""
try:
kv = await self.kv_get_bucket(bucket_name)
entry = await kv.get(key)
logger.debug(f"[NATS KV] Get: {bucket_name}/{key}")
return entry.value if entry else None
except Exception as e:
logger.error(f"[NATS KV] Get error for {bucket_name}/{key}: {e}")
return None
async def kv_keys(self, bucket_name: str, filter_prefix: str = None):
"""List keys in a KV bucket"""
try:
kv = await self.kv_get_bucket(bucket_name)
keys = await kv.keys()
if filter_prefix:
keys = [k for k in keys if k.startswith(filter_prefix)]
logger.debug(f"[NATS KV] Listed {len(keys)} keys from {bucket_name}")
return keys
except Exception as e:
logger.error(f"[NATS KV] Keys error for {bucket_name}: {e}")
return []
async def kv_delete(self, bucket_name: str, key: str):
"""Delete a key from a KV bucket"""
try:
kv = await self.kv_get_bucket(bucket_name)
await kv.delete(key)
logger.debug(f"[NATS KV] Deleted: {bucket_name}/{key}")
except Exception as e:
logger.error(f"[NATS KV] Delete error for {bucket_name}/{key}: {e}")
raise
async def close(self):
if not self._connected:
return
logger.info("[NATS] Closing connection")
try:
await self.nc.drain()
self._connected = False
self.subscriptions.clear()
self._js = None
logger.info("[NATS] Connection closed successfully")
except Exception as e:
logger.error(f"[NATS] Error closing connection: {e}")
@property
def is_connected(self):
return self._connected
@property
def client(self):
"""Access to underlying NATS client for request-reply operations"""
if not self._connected:
raise RuntimeError("EventBus is not connected - cannot access NATS client")
return self.nc
async def __aenter__(self):
await self.connect()
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.close()
nats_bus = NatsEventBus()

476
core/service_registry.py Normal file
View File

@@ -0,0 +1,476 @@
"""
Service Registry System for Vi
Manages service lifecycle, discovery, and health monitoring.
"""
import asyncio
import json
import time
from typing import Dict, Any, List, Optional, Set
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from .logger import setup_logger
logger = setup_logger('service_registry')
class ServiceStatus(Enum):
"""Service status enumeration"""
UNKNOWN = "unknown"
REGISTERING = "registering"
HEALTHY = "healthy"
DEGRADED = "degraded"
UNHEALTHY = "unhealthy"
OFFLINE = "offline"
@dataclass
class ServiceOperation:
"""Represents a service operation/endpoint"""
operation_id: str
description: str
request_topic: str
response_pattern: str = "request-reply"
parameters: List[Dict[str, Any]] = field(default_factory=list)
timeout_ms: int = 5000
metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class ServiceManifest:
"""Service manifest containing metadata and capabilities"""
service_id: str
name: str = ""
description: str = ""
version: str = "1.0.0"
namespace: str = "vi"
operations: List[ServiceOperation] = field(default_factory=list)
dependencies: List[str] = field(default_factory=list)
health_check_topic: str = ""
heartbeat_interval: int = 30
metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class ServiceInstance:
"""Represents a specific instance of a service"""
service_id: str
instance_id: str
manifest: ServiceManifest
status: ServiceStatus = ServiceStatus.UNKNOWN
last_heartbeat: datetime = field(default_factory=datetime.utcnow)
last_health_check: datetime = field(default_factory=datetime.utcnow)
health_data: Dict[str, Any] = field(default_factory=dict)
registered_at: datetime = field(default_factory=datetime.utcnow)
metadata: Dict[str, Any] = field(default_factory=dict)
class ServiceRegistry:
"""Central service registry managing all service discovery and health monitoring"""
def __init__(self, heartbeat_timeout: int = 60, health_check_interval: int = 30):
self.services: Dict[str, ServiceInstance] = {}
self.service_operations: Dict[str, ServiceOperation] = {}
self.service_dependencies: Dict[str, Set[str]] = {}
self.heartbeat_timeout = heartbeat_timeout
self.health_check_interval = health_check_interval
self.event_bus = None
self._health_monitor_task = None
self._running = False
async def initialize(self, event_bus):
"""Initialize the service registry with event bus"""
if self._running:
logger.info("[🗂️] Re-initializing service registry")
await self.shutdown()
self.event_bus = event_bus
await self._register_handlers()
self._running = True
self._health_monitor_task = asyncio.create_task(self._health_monitor_loop())
logger.info("[🗂️] Service registry initialized")
async def shutdown(self):
"""Shutdown the service registry"""
self._running = False
if self._health_monitor_task:
self._health_monitor_task.cancel()
try:
await self._health_monitor_task
except asyncio.CancelledError:
pass
if self.event_bus:
try:
await self.event_bus.off("vi.services.register")
await self.event_bus.off("vi.services.deregister")
await self.event_bus.off("vi.services.heartbeat")
await self.event_bus.off("vi.services.discover")
await self.event_bus.off("vi.services.list")
await self.event_bus.off("vi.services.health")
except Exception as e:
logger.debug(f"[🗂️] Handler cleanup: {e}")
logger.info("[🗂️] Service registry shutdown")
async def _register_handlers(self):
"""Register NATS handlers for service registry operations"""
if not self.event_bus:
return
await self.event_bus.on("vi.services.register", self._handle_service_register)
await self.event_bus.on("vi.services.deregister", self._handle_service_deregister)
await self.event_bus.on("vi.services.heartbeat", self._handle_service_heartbeat)
await self.event_bus.on("vi.services.discover", self._handle_service_discover)
await self.event_bus.on("vi.services.list", self._handle_service_list)
await self.event_bus.on("vi.services.health", self._handle_health_request)
logger.debug("[🗂️] Service registry handlers registered")
def _serialize_manifest(self, manifest: ServiceManifest) -> Dict[str, Any]:
"""Serialize ServiceManifest to JSON-compatible dictionary"""
return {
'service_id': manifest.service_id,
'name': manifest.name,
'description': manifest.description,
'version': manifest.version,
'namespace': manifest.namespace,
'operations': [op.__dict__ for op in manifest.operations],
'dependencies': manifest.dependencies,
'health_check_topic': manifest.health_check_topic,
'heartbeat_interval': manifest.heartbeat_interval,
'metadata': manifest.metadata
}
def register_service(self, service_id: str, manifest: ServiceManifest,
instance_id: Optional[str] = None) -> str:
"""Register a service with the registry"""
if instance_id is None:
instance_id = f"{service_id}-{int(time.time())}"
instance = ServiceInstance(
service_id=service_id,
instance_id=instance_id,
manifest=manifest,
status=ServiceStatus.REGISTERING
)
self.services[service_id] = instance
for operation in manifest.operations:
operation_key = f"{service_id}.{operation.operation_id}"
self.service_operations[operation_key] = operation
if manifest.dependencies:
self.service_dependencies[service_id] = set(manifest.dependencies)
logger.info(f"[🗂️] Registered service: {service_id} (instance: {instance_id})")
return instance_id
def deregister_service(self, service_id: str) -> bool:
"""Deregister a service from the registry"""
instance = self.services.pop(service_id, None)
if not instance:
logger.warning(f"[🗂️] Service {service_id} not found")
return False
operations_to_remove = [
op_key for op_key in self.service_operations.keys()
if op_key.startswith(f"{service_id}.")
]
for op_key in operations_to_remove:
self.service_operations.pop(op_key, None)
self.service_dependencies.pop(service_id, None)
logger.info(f"[🗂️] Deregistered service: {service_id}")
return True
def update_service_heartbeat(self, service_id: str, health_data: Optional[Dict[str, Any]] = None) -> bool:
"""Update service heartbeat and health data"""
instance = self.services.get(service_id)
if not instance:
return False
instance.last_heartbeat = datetime.utcnow()
if health_data:
instance.health_data = health_data
if health_data and 'status' in health_data:
try:
instance.status = ServiceStatus(health_data['status'])
except ValueError:
instance.status = ServiceStatus.UNKNOWN
elif instance.status == ServiceStatus.REGISTERING:
instance.status = ServiceStatus.HEALTHY
return True
def get_service(self, service_id: str) -> Optional[ServiceInstance]:
"""Get service instance by ID"""
return self.services.get(service_id)
def find_service_operation(self, service_id: str, operation_id: str) -> Optional[ServiceOperation]:
"""Find a specific service operation"""
return self.service_operations.get(f"{service_id}.{operation_id}")
def list_services(self, status_filter: Optional[ServiceStatus] = None) -> List[ServiceInstance]:
"""List all services, optionally filtered by status"""
services = list(self.services.values())
if status_filter:
services = [s for s in services if s.status == status_filter]
return services
def list_service_operations(self, service_id: Optional[str] = None) -> List[ServiceOperation]:
"""List all service operations"""
operations = list(self.service_operations.values())
if service_id:
operations = [
op for op_key, op in self.service_operations.items()
if op_key.startswith(f"{service_id}.")
]
return operations
def check_dependencies(self, service_id: str) -> Dict[str, bool]:
"""Check if service dependencies are available"""
dependencies = self.service_dependencies.get(service_id, set())
result = {}
for dep_service_id in dependencies:
dep_instance = self.services.get(dep_service_id)
result[dep_service_id] = (
dep_instance is not None and
dep_instance.status in [ServiceStatus.HEALTHY, ServiceStatus.DEGRADED]
)
return result
async def _health_monitor_loop(self):
"""Background task to monitor service health"""
while self._running:
try:
await self._check_service_health()
await asyncio.sleep(self.health_check_interval)
except asyncio.CancelledError:
break
except Exception as e:
logger.exception(f"[🗂️] Health monitor error: {e}")
await asyncio.sleep(5)
async def _check_service_health(self):
"""Check health of all registered services"""
current_time = datetime.utcnow()
for service_id, instance in self.services.items():
heartbeat_age = (current_time - instance.last_heartbeat).total_seconds()
if heartbeat_age > self.heartbeat_timeout:
if instance.status != ServiceStatus.OFFLINE:
logger.warning(f"[🗂️] Service {service_id} heartbeat timeout ({heartbeat_age}s)")
instance.status = ServiceStatus.OFFLINE
if self.event_bus:
await self.event_bus.emit("vi.services.health.status", {
"service_id": service_id,
"instance_id": instance.instance_id,
"status": instance.status.value,
"last_heartbeat": instance.last_heartbeat.isoformat(),
"health_data": instance.health_data
})
# NATS Event Handlers
async def _handle_service_register(self, msg):
"""Handle service registration requests"""
try:
if hasattr(msg, 'data'):
payload = json.loads(msg.data.decode())
else:
payload = msg if isinstance(msg, dict) else json.loads(msg)
service_id = payload.get('service_id')
manifest_data = payload.get('manifest', {})
instance_id = payload.get('instance_id')
if not service_id:
logger.error("[🗂️] Registration error: Missing service_id")
return
manifest_data.pop('service_id', None)
operations_data = manifest_data.pop('operations', [])
operations = []
for op_data in operations_data:
if isinstance(op_data, dict):
operation = ServiceOperation(
operation_id=op_data.get('operation_id', ''),
description=op_data.get('description', ''),
request_topic=op_data.get('request_topic', ''),
response_pattern=op_data.get('response_pattern', 'request-reply'),
parameters=op_data.get('parameters', []),
timeout_ms=op_data.get('timeout_ms', 5000),
metadata=op_data.get('metadata', {})
)
operations.append(operation)
else:
operations.append(op_data)
manifest = ServiceManifest(service_id=service_id, operations=operations, **manifest_data)
actual_instance_id = self.register_service(service_id, manifest, instance_id)
logger.info(f"[🗂️] ✅ Service registered: {service_id}")
except Exception as e:
logger.exception(f"[🗂️] Registration error: {e}")
async def _handle_service_deregister(self, msg):
"""Handle service deregistration requests"""
try:
if hasattr(msg, 'data'):
payload = json.loads(msg.data.decode())
else:
payload = msg if isinstance(msg, dict) else json.loads(msg)
service_id = payload.get('service_id')
if service_id:
self.deregister_service(service_id)
except Exception as e:
logger.exception(f"[🗂️] Deregistration error: {e}")
async def _handle_service_heartbeat(self, msg):
"""Handle service heartbeat updates"""
try:
if hasattr(msg, 'data'):
payload = json.loads(msg.data.decode())
else:
payload = msg if isinstance(msg, dict) else json.loads(msg)
service_id = payload.get('service_id')
instance_id = payload.get('instance_id')
health_data = payload.get('health_data')
if not service_id:
if hasattr(msg, 'respond'):
await msg.respond(json.dumps({'acknowledged': False}).encode())
return
success = self.update_service_heartbeat(service_id, health_data)
response = {
'acknowledged': success,
'service_id': service_id,
'instance_id': instance_id,
'timestamp': datetime.utcnow().isoformat()
}
if hasattr(msg, 'respond'):
await msg.respond(json.dumps(response).encode())
except Exception as e:
logger.exception(f"[🗂️] Heartbeat error: {e}")
async def _handle_service_discover(self, msg):
"""Handle service discovery requests"""
try:
if hasattr(msg, 'data'):
payload = json.loads(msg.data.decode())
else:
payload = msg if isinstance(msg, dict) else json.loads(msg)
service_id = payload.get('service_id')
operation_id = payload.get('operation_id')
if service_id and operation_id:
operation = self.find_service_operation(service_id, operation_id)
result = operation.__dict__ if operation and hasattr(operation, '__dict__') else operation
elif service_id:
instance = self.get_service(service_id)
result = {
'service_id': instance.service_id,
'instance_id': instance.instance_id,
'status': instance.status.value,
'manifest': self._serialize_manifest(instance.manifest),
'last_heartbeat': instance.last_heartbeat.isoformat()
} if instance else None
else:
services = self.list_services()
result = [
{
'service_id': s.service_id,
'status': s.status.value,
'last_heartbeat': s.last_heartbeat.isoformat()
}
for s in services
]
await msg.respond(json.dumps({'result': result}).encode())
except Exception as e:
logger.exception(f"[🗂️] Discovery error: {e}")
await msg.respond(json.dumps({'error': str(e)}).encode())
async def _handle_service_list(self, msg):
"""Handle service listing requests"""
try:
if hasattr(msg, 'data'):
payload = json.loads(msg.data.decode())
else:
payload = msg if isinstance(msg, dict) else json.loads(msg)
status_filter = payload.get('status_filter')
status_enum = ServiceStatus(status_filter) if status_filter else None
services = self.list_services(status_enum)
result = [
{
'service_id': s.service_id,
'status': s.status.value,
'name': s.manifest.name,
'version': s.manifest.version,
'last_heartbeat': s.last_heartbeat.isoformat()
}
for s in services
]
await msg.respond(json.dumps({'services': result}).encode())
except Exception as e:
logger.exception(f"[🗂️] List error: {e}")
await msg.respond(json.dumps({'error': str(e)}).encode())
async def _handle_health_request(self, msg):
"""Handle health status requests"""
try:
if hasattr(msg, 'data'):
payload = json.loads(msg.data.decode())
else:
payload = msg if isinstance(msg, dict) else json.loads(msg)
service_id = payload.get('service_id')
if service_id:
instance = self.get_service(service_id)
result = {
'service_id': service_id,
'status': instance.status.value,
'last_heartbeat': instance.last_heartbeat.isoformat(),
'health_data': instance.health_data
} if instance else None
else:
result = {
sid: {
'status': inst.status.value,
'last_heartbeat': inst.last_heartbeat.isoformat(),
'health_data': inst.health_data
}
for sid, inst in self.services.items()
}
await msg.respond(json.dumps({'health': result}).encode())
except Exception as e:
logger.exception(f"[🗂️] Health request error: {e}")
await msg.respond(json.dumps({'error': str(e)}).encode())
service_registry = ServiceRegistry()