# ---- Build stage ------------------------------------------------------------
# Installs every Python dependency of the server into the system
# dist-packages directory, including llama-cpp-python, which is configured
# below to be built with CUDA enabled. The final stage copies the installed
# packages and console scripts out of this stage; nothing else is kept.
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS builder
WORKDIR /build

# Build prerequisites. Recommended packages are intentionally left enabled in
# this throw-away stage so whatever toolchain pieces they pull in stay
# available to the native build. DEBIAN_FRONTEND is set inline (not via ENV)
# so package configuration can never block the build on a prompt.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        cmake \
        git \
        python3-dev \
        python3-pip \
        python3.10 && \
    rm -rf /var/lib/apt/lists/*

RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# PyTorch wheels are taken from the CUDA 12.4 wheel index so they match the
# CUDA version of the base image.
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

# Put the CUDA stub libraries on the library search path for this stage only.
# NOTE(review): presumably needed so the llama-cpp-python build can resolve
# libcuda without a GPU driver on the build host - confirm before removing.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH

# Build configuration read by the llama-cpp-python install: enable the CUDA
# backend, disable the LLaVA build, and force the CMake-based build.
ENV CMAKE_ARGS="-DGGML_CUDA=on -DLLAVA_BUILD=off" \
    FORCE_CMAKE=1

RUN pip install --no-cache-dir llama-cpp-python

# The requirements file is copied only now, right before it is consumed.
# Copying it earlier would invalidate the expensive torch and llama-cpp-python
# layers above every time a requirement line changes.
COPY requirements/inference.txt .
RUN pip install --no-cache-dir -r inference.txt

# ---- Runtime stage ----------------------------------------------------------
# Slim image that serves gpu_server:app with uvicorn on port 8001. Python
# packages and console scripts are copied from the builder stage rather than
# installed here.
FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 AS runtime
WORKDIR /app

# Interpreter only; --no-install-recommends keeps the compiler toolchain that
# python3-pip recommends out of the runtime image.
# libgomp1 is listed explicitly because it is no longer pulled in implicitly.
# NOTE(review): assumes the llama-cpp-python build links against the OpenMP
# runtime - confirm. If gpu_server.py needs a C compiler at runtime (for
# example torch.compile), add build-essential back here.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        libgomp1 \
        python3-pip \
        python3.10 && \
    rm -rf /var/lib/apt/lists/* && \
    ln -sf /usr/bin/python3 /usr/bin/python

# Dedicated unprivileged account with a fixed UID/GID so orchestrators can
# verify it; it gets a home directory (for per-user caches) and owns /app.
# NOTE(review): volumes previously mounted under /root must be remounted under
# /home/app, since the server no longer runs as root.
RUN groupadd --gid 10001 app && \
    useradd --uid 10001 --gid app --create-home --shell /usr/sbin/nologin app && \
    chown app:app /app

# Installed Python packages and their console scripts from the build stage.
COPY --from=builder /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=builder /usr/local/bin /usr/local/bin

COPY --chown=app:app gpu_server.py .

# Unbuffered output so logs appear immediately; /app on the module path so
# "gpu_server" is importable.
ENV PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app
EXPOSE 8001

# Drop privileges only after every step that needs root has run.
USER app

CMD ["python", "-m", "uvicorn", "gpu_server:app", "--host", "0.0.0.0", "--port", "8001"]