Self-hosted inference with the vLLM engine, CUDA acceleration, and Docker containerization for local AI deployment.
Demonstrates GPU-accelerated, OpenAI-compatible model serving with vLLM inside Docker, driven from a local TypeScript REPL client.
Project structure:
├── docker-compose.yaml   # Docker services
├── model/                # Model storage
├── src/
│   ├── repl.ts           # REPL interface
│   └── lib/
│       └── swarm.ts      # Swarm configuration
└── index.ts              # Entry point
# Ubuntu/Debian
sudo apt install nvidia-container-runtime
# Restart Docker
sudo systemctl restart docker
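Note: on recent distributions the relevant package is often `nvidia-container-toolkit` rather than `nvidia-container-runtime`; either way, confirm that `nvidia-smi` works on the host before bringing up the container.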
# Clone and setup
git clone <repository>
cd vllm-docker-cuda
# Start vLLM server
docker-compose up -d vllm-server
# Check status
docker-compose logs vllm-server
# Install dependencies
bun install
# Start client
bun run src/repl.ts
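For orientation, here is a minimal sketch of what a REPL client along the lines of `src/repl.ts` could look like. It assumes the server's OpenAI-compatible chat completions endpoint; the environment variable defaults and the readline loop are illustrative, not the repository's actual implementation.

```typescript
// Illustrative REPL against the vLLM OpenAI-compatible API (not the repo's actual repl.ts).
import * as readline from "node:readline/promises";

const API_URL = process.env.VLLM_API_URL ?? "http://localhost:8000/v1";
const MODEL = process.env.VLLM_MODEL ?? "microsoft/DialoGPT-medium";

// Send one user message and return the assistant's reply.
async function chat(content: string): Promise<string> {
  const res = await fetch(`${API_URL}/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ model: MODEL, messages: [{ role: "user", content }] }),
  });
  if (!res.ok) throw new Error(`vLLM request failed: ${res.status}`);
  const data = await res.json();
  return data.choices[0].message.content;
}

const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
while (true) {
  const line = await rl.question("> ");
  if (line.trim() === "exit") break;
  console.log(await chat(line));
}
rl.close();
```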
services:
  vllm-server:
    image: vllm/vllm-openai:latest
    runtime: nvidia
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    command: >
      --model microsoft/DialoGPT-medium
      --host 0.0.0.0
      --port 8000
    ports:
      - "8000:8000"
VLLM_API_URL=http://localhost:8000/v1
VLLM_MODEL=microsoft/DialoGPT-medium
CUDA_VISIBLE_DEVICES=0
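`VLLM_API_URL` and `VLLM_MODEL` are presumably read by the TypeScript client, while `CUDA_VISIBLE_DEVICES=0` restricts CUDA to the first GPU.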
# Replace model in docker-compose.yaml
--model microsoft/DialoGPT-medium
# with
--model meta-llama/Llama-2-7b-chat-hf
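Note that gated checkpoints such as `meta-llama/Llama-2-7b-chat-hf` require accepting the model license on Hugging Face and providing an access token to the container (for example via the `HUGGING_FACE_HUB_TOKEN` environment variable); larger models also need correspondingly more GPU memory.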
# GPU monitoring
nvidia-smi
# Memory usage
docker stats vllm-server
command: >
  --model microsoft/DialoGPT-medium
  --tensor-parallel-size 1
  --max-num-batched-tokens 4096
  --max-num-seqs 256
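Roughly: `--tensor-parallel-size` sets how many GPUs the model is sharded across, `--max-num-batched-tokens` caps the tokens processed per scheduling step, and `--max-num-seqs` limits how many sequences are batched concurrently. Higher values generally improve throughput at the cost of GPU memory.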
# Chat completions
curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "microsoft/DialoGPT-medium",
    "messages": [{"role": "user", "content": "Hello"}]
  }'
# Available models
curl http://localhost:8000/v1/models
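The same endpoints can be called from TypeScript. A small sketch that lists the served models, assuming only the OpenAI-compatible response shape (`data[].id`):

```typescript
// Query the OpenAI-compatible /v1/models endpoint of the local vLLM server.
const res = await fetch("http://localhost:8000/v1/models");
if (!res.ok) throw new Error(`model listing failed: ${res.status}`);
const body = await res.json();
for (const model of body.data) {
  console.log(model.id); // e.g. "microsoft/DialoGPT-medium"
}
```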
Ideal for local development, offline experimentation, and self-hosted setups that keep prompts and data off external APIs.
# Container health
docker-compose ps
# Logs
docker-compose logs -f vllm-server
# GPU utilization
watch nvidia-smi
# API health check
curl http://localhost:8000/health