Doradus AI committed
Commit be2f435 · verified · 1 parent: 81b6ca6

Upload docker/docker-compose.yml with huggingface_hub
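
For reference, an upload like this can be reproduced with the huggingface_hub CLI. This is an illustrative sketch, not the author's exact command: the target repo id is assumed from the model served in the compose file, and flag names may vary across huggingface_hub versions.

    huggingface-cli upload Doradus/MiroThinker-v1.0-30B-FP8 \
        docker/docker-compose.yml docker/docker-compose.yml \
        --commit-message "Upload docker/docker-compose.yml with huggingface_hub"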

Files changed (1)
  1. docker/docker-compose.yml +64 -0
docker/docker-compose.yml ADDED
@@ -0,0 +1,64 @@
+# MiroThinker-v1.0-30B-FP8 Docker Compose
+#
+# Usage (TP=2, recommended):
+#   docker compose up
+#
+# Usage (single GPU, not recommended):
+#   SINGLE_GPU=1 docker compose up
+
+services:
+  mirothinker:
+    image: vllm/vllm-openai:v0.11.2
+    ports:
+      - "8000:8000"
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - HF_HOME=/root/.cache/huggingface
+      # Set SINGLE_GPU=1 for single GPU mode (poor performance)
+      - SINGLE_GPU=${SINGLE_GPU:-}
+    volumes:
+      # Cache downloaded models
+      - hf_cache:/root/.cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    shm_size: '16gb'
+    command: >
+      sh -c '
+      if [ -n "$SINGLE_GPU" ]; then
+      echo "WARNING: Single GPU mode - expect ~1-2 tok/s, 2K context max";
+      python -m vllm.entrypoints.openai.api_server
+      --model Doradus/MiroThinker-v1.0-30B-FP8
+      --host 0.0.0.0
+      --port 8000
+      --tensor-parallel-size 1
+      --max-model-len 2048
+      --max-num-seqs 4
+      --gpu-memory-utilization 0.95
+      --enforce-eager
+      --trust-remote-code;
+      else
+      python -m vllm.entrypoints.openai.api_server
+      --model Doradus/MiroThinker-v1.0-30B-FP8
+      --host 0.0.0.0
+      --port 8000
+      --tensor-parallel-size 2
+      --max-model-len 32768
+      --gpu-memory-utilization 0.90
+      --trust-remote-code
+      --enable-chunked-prefill;
+      fi
+      '
+    healthcheck:
+      test: ["CMD", "curl", "-sf", "http://localhost:8000/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 120s
+
+volumes:
+  hf_cache:
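
Once the healthcheck passes, the container exposes vLLM's OpenAI-compatible HTTP API on port 8000. A minimal smoke test, assuming the default /health and /v1/chat/completions routes:

    # Liveness probe (same endpoint the compose healthcheck uses)
    curl -sf http://localhost:8000/health

    # Chat completion; the model name must match the --model value above
    curl -s http://localhost:8000/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{
              "model": "Doradus/MiroThinker-v1.0-30B-FP8",
              "messages": [{"role": "user", "content": "Say hello."}],
              "max_tokens": 64
            }'

Note that in single-GPU mode the 2048-token --max-model-len applies to prompt plus completion, so keep requests short.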