1
- FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
1
# Base image: the -runtime variant of the PyTorch 2.1.0 / CUDA 12.1 / cuDNN 8 image.
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime

# Keep apt/debconf non-interactive while building. ARG is exposed to the RUN
# steps of this stage as an environment variable but, unlike ENV, is not
# persisted into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive
@@ -17,22 +17,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
17
17
ninja-build \
18
18
&& rm -rf /var/lib/apt/lists/*
19
19
20
- # Install NVIDIA driver libraries needed for libcuda.so and libnvidia-ml.so
21
- RUN apt-get update && apt-get install -y --no-install-recommends \
22
- nvidia-driver-525 \
23
- && rm -rf /var/lib/apt/lists/*
20
# IMPORTANT: Do not install NVIDIA drivers in this image; fly.io provides them.
# The A10 GPU machines on fly.io already have the proper drivers installed.
24
22
25
23
# Application root: every later relative path in this file resolves against it.
WORKDIR /app
# Create the data directory inside the application root.
RUN mkdir -p /app/data
28
26
29
- # Install Python packages - use the pre-installed PyTorch
27
# Install Python packages.
# The installs stay as separate, ordered pip invocations: pip is upgraded
# first, then the pinned transformers release, then sglang, then the tooling.
# Only faiss-gpu is installed: faiss-gpu and faiss-cpu both provide the same
# top-level `faiss` package, so installing them side by side makes one
# silently overwrite the other.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir "transformers==4.48.3" && \
    pip install --no-cache-dir "sglang[all]>=0.4.2.post4" && \
    pip install --no-cache-dir \
        bitsandbytes \
        faiss-gpu \
        jupyter \
        matplotlib \
        nvitop \
        pandas \
        scikit-learn \
        seaborn \
        tqdm && \
    pip install --no-cache-dir accelerate
37
33
38
34
# Copy source code
@@ -41,8 +37,18 @@ COPY . .
41
37
# Place the GPU check script at a fixed absolute path inside the image.
COPY src_deploy/check_gpu.py /app/check_gpu.py
43
39
40
# NVIDIA container runtime configuration: request all GPUs and the
# compute + utility driver capabilities.
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility

# Rebuild the dynamic linker cache.
RUN ldconfig
46
46
47
- CMD ["sh" , "-c" , "python /app/check_gpu.py && python -m sglang.launch_server --model-path microsoft/Phi-3.5-mini-instruct --host 0.0.0.0 --port 8899 --api-key None --mem-fraction-static 0.9 --max-running-requests 1024 --attention-backend triton --disable-cuda-graph --dtype float16 --chunked-prefill-size 512 --enable-metrics --show-time-cost --enable-cache-report --log-level info --watchdog-timeout 120 --schedule-policy lpm --schedule-conservativeness 0.8" ]
47
# Health check: still asserts that CUDA is available, and additionally probes
# the server started by CMD on port 8899 so a wedged or crashed server is
# detected too. urlopen raises (non-zero exit) on connection errors and on
# HTTP error statuses.
# The start period is long because the server has to load the model before it
# can answer; failures inside the start period do not count towards retries.
# NOTE(review): assumes sglang serves GET /health without requiring the API
# key - confirm against the installed sglang version.
HEALTHCHECK --interval=30s --timeout=30s --start-period=300s --retries=3 \
    CMD python -c "import torch, urllib.request; assert torch.cuda.is_available(), 'CUDA not available'; urllib.request.urlopen('http://127.0.0.1:8899/health', timeout=10)"
50
+
51
# Document the port the server listens on (matches --port in CMD below).
EXPOSE 8899

# Startup: run the GPU check script and, only if it exits successfully, launch
# the sglang server. `exec` replaces the wrapper shell with the server process,
# so the server receives stop signals (e.g. SIGTERM on container stop) directly
# instead of them being delivered to an intermediate `sh`.
# NOTE(review): `--api-key None` passes the literal string "None" on the
# command line; confirm this is intended rather than omitting the flag.
CMD ["sh", "-c", "python /app/check_gpu.py && exec python -m sglang.launch_server --model-path microsoft/Phi-3.5-mini-instruct --host 0.0.0.0 --port 8899 --api-key None --mem-fraction-static 0.9 --max-running-requests 1024 --attention-backend triton --disable-cuda-graph --dtype float16 --chunked-prefill-size 512 --enable-metrics --show-time-cost --enable-cache-report --log-level info --watchdog-timeout 120 --schedule-policy lpm --schedule-conservativeness 0.8"]
0 commit comments