[update] workflow deploy issues

sagar-sarkale-yral · sagar-sarkale-yral · commit 80278a84bdb1 · 2025-03-23T14:52:35.000+05:30
diff --git a/src_deploy/gpu.Dockerfile b/src_deploy/gpu.Dockerfile
@@ -1,4 +1,4 @@
-FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
+FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime
 
 # Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
@@ -17,22 +17,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ninja-build \
     && rm -rf /var/lib/apt/lists/*
 
-# Install NVIDIA driver libraries needed for libcuda.so and libnvidia-ml.so
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    nvidia-driver-525 \
-    && rm -rf /var/lib/apt/lists/*
+# IMPORTANT: Don't install NVIDIA drivers - fly.io provides these
+# The A10 GPUs on fly.io already have the proper drivers
 
 # Set up working directory
 WORKDIR /app
 RUN mkdir -p ./data
 
-# Install Python packages - use the pre-installed PyTorch
+# Install Python packages
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir "transformers==4.48.3" && \
     pip install --no-cache-dir "sglang[all]>=0.4.2.post4" && \
     pip install --no-cache-dir jupyter pandas tqdm nvitop scikit-learn seaborn matplotlib faiss-gpu faiss-cpu bitsandbytes && \
-    # pip install --no-cache-dir flash-attn --no-build-isolation && \
-    # pip install --no-cache-dir autoawq --no-build-isolation && \
     pip install --no-cache-dir accelerate
 
 # Copy source code
@@ -41,8 +37,18 @@ COPY . .
 # Add GPU check script
 COPY ./src_deploy/check_gpu.py /app/check_gpu.py
 
+# Setup proper NVIDIA configurations
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
 # Run ldconfig to update library cache
 RUN ldconfig
 
-CMD ["sh", "-c", "python /app/check_gpu.py && python -m sglang.launch_server --model-path microsoft/Phi-3.5-mini-instruct --host 0.0.0.0 --port 8899 --api-key None --mem-fraction-static 0.9 --max-running-requests 1024 --attention-backend triton --disable-cuda-graph --dtype float16 --chunked-prefill-size 512 --enable-metrics --show-time-cost --enable-cache-report --log-level info --watchdog-timeout 120 --schedule-policy lpm --schedule-conservativeness 0.8"]
+# Use a healthcheck to ensure GPU is available
+HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
+    CMD python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'"
+
+# Expose port for the service
+EXPOSE 8899
 
+CMD ["sh", "-c", "python /app/check_gpu.py && python -m sglang.launch_server --model-path microsoft/Phi-3.5-mini-instruct --host 0.0.0.0 --port 8899 --api-key None --mem-fraction-static 0.9 --max-running-requests 1024 --attention-backend triton --disable-cuda-graph --dtype float16 --chunked-prefill-size 512 --enable-metrics --show-time-cost --enable-cache-report --log-level info --watchdog-timeout 120 --schedule-policy lpm --schedule-conservativeness 0.8"]
diff --git a/src_deploy/gpu.fly.toml b/src_deploy/gpu.fly.toml
@@ -10,7 +10,7 @@ primary_region = 'ord'
   dockerfile = 'gpu.Dockerfile'
 
 [http_service]
-  internal_port = 8080
+  internal_port = 8899
   force_https = true
   auto_stop_machines = 'stop'
   auto_start_machines = true