# Ollama image intended to run under GTAP — ships without a CUDA distribution.
#
# libggml-cuda.so is compiled from the Ollama sources with native SASS for
# the requested GPU architecture, so the expensive PTX JIT step at load time
# is skipped. The remaining CUDA user-mode libraries (cuBLAS, cuBLASLt,
# cudart) are deleted from the image because GTAP interposes them at link
# time and provides its own copies.
#
# Build:
#   docker build --build-arg CMAKE_CUDA_ARCHITECTURES=89 -t ollama-demo .
#
# Run (via gtap):
#   gtap docker run --rm ollama-demo

# ---------------------------------------------------------------------------
# Stage 1: compile libggml-cuda.so with native SASS for the target GPU
# ---------------------------------------------------------------------------
FROM nvidia/cuda:13.0.1-devel-ubuntu24.04 AS ggml-cuda-build

# Target GPU architecture (SM version) for nvcc; override at build time.
ARG CMAKE_CUDA_ARCHITECTURES=121
# Ollama release tag to check out and build against.
ARG OLLAMA_VERSION=v0.17.7
# Build parallelism for the CUDA kernel compilation.
ARG PARALLEL=8

# Toolchain needed only in this throwaway build stage.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git cmake build-essential && \
    rm -rf /var/lib/apt/lists/*

# Shallow clone of the pinned release tag — full history is not needed.
RUN git clone --depth 1 --branch ${OLLAMA_VERSION} \
    https://github.com/ollama/ollama.git /ollama-src

WORKDIR /ollama-src

# Configure with the upstream 'CUDA 13' preset, build the CUDA component,
# and install it (stripped) into /ollama-dist for the runtime stage to copy.
RUN cmake --preset 'CUDA 13' \
        -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHITECTURES}" \
        -DCMAKE_INSTALL_PREFIX=/ollama-dist && \
    cmake --build --preset 'CUDA 13' --parallel ${PARALLEL} && \
    cmake --install build --component CUDA --strip

# ---------------------------------------------------------------------------
# Stage 2: runtime — plain Ubuntu, no CUDA distribution inside
# ---------------------------------------------------------------------------
FROM ubuntu:24.04

# Populated automatically by BuildKit (amd64/arm64); selects the release asset.
ARG TARGETARCH
ARG OLLAMA_VERSION=v0.17.7

# curl + CA certs to fetch the release; zstd to unpack the .tar.zst asset.
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl ca-certificates zstd && \
    rm -rf /var/lib/apt/lists/*

# Install the Ollama release, then strip all bundled CUDA libraries.
# GTAP intercepts cuBLAS, cuBLASLt, and cudart via LD_AUDIT, so they
# are not needed in the image.
RUN mkdir -p /ollama && cd /ollama && \
    curl -L "https://github.com/ollama/ollama/releases/download/${OLLAMA_VERSION}/ollama-linux-${TARGETARCH}.tar.zst" \
        -o ollama.tar.zst && \
    tar --use-compress-program=unzstd -xf ollama.tar.zst && \
    rm ollama.tar.zst && \
    rm -rf lib/ollama/cuda_v12 && \
    rm -f lib/ollama/cuda_v13/libcublas* lib/ollama/cuda_v13/libcublasLt* lib/ollama/cuda_v13/libcudart*

# Replace PTX-only libggml-cuda.so with native SASS build
COPY --from=ggml-cuda-build \
    /ollama-dist/lib/ollama/cuda_v13/libggml-cuda.so \
    /ollama/lib/ollama/cuda_v13/libggml-cuda.so

ENV OLLAMA_PATH="/ollama"
ENV PATH="$OLLAMA_PATH/bin:$PATH"
# NOTE(review): libraries live under $OLLAMA_PATH/lib/ollama, not $OLLAMA_PATH
# itself — presumably ollama resolves them relative to its binary; confirm
# this path is what the loader actually needs.
ENV LD_LIBRARY_PATH="$OLLAMA_PATH"
ENV OLLAMA_KEEP_ALIVE="-1"
ENV OLLAMA_HOST="0.0.0.0:11434"
ENV GGML_CUDA_NO_PINNED=1

# Entrypoint: start ollama serve in the background, wait for readiness,
# then run the command.
#
# Two fixes over the naive version:
#  * If the requested command is exactly "ollama serve" (the default CMD),
#    exec it in the foreground instead — otherwise the background server
#    already holds :11434 and the exec'd second server fails to bind.
#  * Bail out (with the server log on stderr) if the background server
#    dies during startup or never becomes ready within ~60s, instead of
#    spinning in the readiness loop forever.
RUN cat <<'ENTRYPOINT' > /entrypoint.sh && chmod +x /entrypoint.sh
#!/bin/sh
if [ "$*" = "ollama serve" ]; then
  exec ollama serve
fi
ollama serve >/var/log/ollama.log 2>&1 &
server_pid=$!
tries=0
until curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; do
  if ! kill -0 "$server_pid" 2>/dev/null; then
    echo "ollama serve exited during startup:" >&2
    cat /var/log/ollama.log >&2
    exit 1
  fi
  tries=$((tries + 1))
  if [ "$tries" -ge 600 ]; then
    echo "timed out waiting for ollama serve to become ready" >&2
    exit 1
  fi
  sleep 0.1
done
exec "$@"
ENTRYPOINT

WORKDIR /ollama
ENTRYPOINT ["/entrypoint.sh"]
CMD ["ollama", "serve"]