""" Model loading for vLLM backend. """ import os from typing import Optional, Dict, Any from core.logger import setup_logger logger = setup_logger('model_loader', service_name='oracle_service') class ModelLoader: """Handles model loading with vLLM""" def __init__(self, model_path: str = None): if model_path is None: model_path = os.getenv('ORACLE_MODEL_PATH', "/data/models/Qwen3-32B-GPTQ-Int4") self.model_path = model_path if model_path.startswith(('/', './', '../')): self.model_cache_dir = os.path.dirname(model_path) else: self.model_cache_dir = os.getenv('HF_HOME', "/data/models") self.llm = None self.model_name: Optional[str] = None self.backend_type: Optional[str] = None self.is_loaded = False async def load_model(self, model_path: Optional[str] = None) -> bool: """Load model using vLLM""" try: target_model = model_path or self.model_path logger.info(f"[✺] Loading model: {target_model}") if self.is_loaded and self.model_name == target_model: logger.info("[✺] Model already loaded") return True if not self._validate_model_path(target_model): logger.error(f"[✺] Model not found: {target_model}") return False return await self._load_vllm(target_model) except Exception as e: logger.exception(f"[✺] Failed to load model: {e}") self.is_loaded = False return False def _validate_model_path(self, model_path: str) -> bool: """Check if model exists locally""" if not model_path.startswith('/'): return False if not os.path.exists(model_path) or not os.path.isdir(model_path): return False # Check for config.json if not os.path.exists(os.path.join(model_path, 'config.json')): return False # Check for model files has_model = ( os.path.exists(os.path.join(model_path, 'model.safetensors')) or os.path.exists(os.path.join(model_path, 'pytorch_model.bin')) or os.path.exists(os.path.join(model_path, 'model.safetensors.index.json')) ) return has_model async def _load_vllm(self, model_path: str) -> bool: """Load with vLLM""" try: from vllm import LLM import torch if not torch.cuda.is_available(): logger.warning("[✺] CUDA not available") return False gpu_name = torch.cuda.get_device_name(0) gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3 logger.info(f"[✺] GPU: {gpu_name}, {gpu_memory:.1f}GB") is_gptq = 'GPTQ' in model_path or 'gptq' in model_path.lower() max_context = int(os.getenv("MAX_MODEL_LEN", "8192")) vllm_config = { "model": model_path, "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", "0.90")), "max_model_len": max_context, "tensor_parallel_size": 1, "trust_remote_code": False, "download_dir": self.model_cache_dir, "disable_log_stats": True, "max_num_seqs": 16, "swap_space": 4, } if is_gptq: vllm_config["dtype"] = "float16" vllm_config["quantization"] = "gptq_marlin" logger.info("[✺] GPTQ mode enabled") logger.info(f"[✺] Initializing vLLM - max_context: {max_context}") self.llm = LLM(**vllm_config) self.backend_type = "vllm" self.model_name = model_path self.is_loaded = True logger.info(f"[✺] Model loaded: {model_path}") return True except Exception as e: logger.error(f"[✺] vLLM failed: {e}") return False async def unload_model(self): """Unload model""" try: if self.llm is not None: del self.llm self.llm = None self.model_name = None self.backend_type = None self.is_loaded = False try: import torch if torch.cuda.is_available(): torch.cuda.empty_cache() except ImportError: pass logger.info("[✺] Model unloaded") except Exception as e: logger.exception(f"[✺] Unload error: {e}") def get_model_info(self) -> Dict[str, Any]: """Get model info""" info = { "model_name": 
self.model_name, "model_path": self.model_path, "backend_type": self.backend_type, "is_loaded": self.is_loaded, } try: import torch if torch.cuda.is_available(): info["gpu_available"] = True info["gpu_memory_total"] = torch.cuda.get_device_properties(0).total_memory / 1024**3 info["gpu_memory_reserved"] = torch.cuda.memory_reserved(0) / 1024**3 else: info["gpu_available"] = False except ImportError: info["gpu_available"] = False return info
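

# Usage sketch (illustrative only): a minimal async entry point showing how the
# loader might be driven. The __main__ guard is hypothetical and not part of the
# service wiring; it simply exercises load -> info -> unload against the
# ORACLE_MODEL_PATH / default path configured above.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        loader = ModelLoader()
        if await loader.load_model():
            print(loader.get_model_info())
            await loader.unload_model()
        else:
            print(f"Model failed to load: {loader.model_path}")

    asyncio.run(_demo())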