# Two-stage image for the GPU inference server.
#   builder: CUDA devel image; installs/compiles the Python dependencies.
#   final:   CUDA runtime image; receives only the installed packages and app code.

# ---------------------------------------------------------------------------
# Stage 1: build Python dependencies against the CUDA toolkit
# ---------------------------------------------------------------------------
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS builder

WORKDIR /build

# Toolchain for building wheels from source. build-essential and
# ca-certificates are listed explicitly because --no-install-recommends
# stops them from being pulled in implicitly.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
        python3-dev \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# CUDA 12.4 wheels from the PyTorch index. Installed before the requirements
# file is copied so that editing requirements does not invalidate this layer.
RUN pip install --no-cache-dir torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/cu124

# Build-time settings for llama-cpp-python (builder stage only; none of this
# reaches the final image):
#   - the CUDA "stubs" directory is put on the library path so the build can
#     resolve driver libraries that are not present inside a build container
#   - CMAKE_ARGS / FORCE_CMAKE make pip compile from source with the CUDA
#     backend enabled and the LLaVA build disabled
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
    CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" \
    FORCE_CMAKE=1

RUN pip install --no-cache-dir llama-cpp-python

# Copied as late as possible: a change to the requirements file only re-runs
# the install below, not the torch download or the llama-cpp-python compile.
COPY requirements/inference.txt .
RUN pip install --no-cache-dir -r inference.txt

# ---------------------------------------------------------------------------
# Stage 2: runtime image
# ---------------------------------------------------------------------------
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04

WORKDIR /app

# Python interpreter plus a `python` alias (CMD invokes `python`).
# NOTE(review): --no-install-recommends keeps compilers out of the runtime
# image; if the server relies on runtime JIT compilation (e.g. torch.compile),
# add build-essential here.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3 /usr/bin/python

# Unprivileged account for the server process. A fixed numeric UID/GID lets
# orchestrators verify non-root execution; /app is handed to this user so the
# application can still write inside its working directory.
RUN groupadd --system --gid 10001 app \
    && useradd --system --uid 10001 --gid app --create-home --home-dir /home/app app \
    && chown app:app /app

# Installed packages and their console scripts from the builder stage.
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Application source (pair with a .dockerignore to keep the context small).
COPY --chown=app:app . .

ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app

USER app

EXPOSE 8001

CMD ["python", "-m", "uvicorn", "gpu_server:app", "--host", "0.0.0.0", "--port", "8001"]