# VM 5702 — Inference: llama.cpp server (CPU-friendly)
# Copy to /opt/ai/inference/ and place model at /opt/ai/inference/data/models/model.gguf
# See: docs/02-architecture/AI_AGENTS_57XX_DEPLOYMENT_PLAN.md Appendix D
services:
  llama:
    image: ghcr.io/ggerganov/llama.cpp:server
    container_name: ai-inf-prod
    volumes:
      # Host model directory mounted read-only into the container at /models
      - /opt/ai/inference/data/models:/models
    # Folded scalar (>-): the flags below join into one command line, no trailing newline.
    # --n-gpu-layers 0 keeps inference fully on CPU (no GPU offload).
    command: >-
      -m /models/model.gguf
      --host 0.0.0.0
      --port 8000
      --n-gpu-layers 0
      --ctx-size 4096
    ports:
      # Quoted to avoid YAML sexagesimal/implicit-typing traps on port mappings
      - "8000:8000"
    restart: unless-stopped