# Ollama image intended to run under GTAP — ships without a CUDA distribution.
#
# libggml-cuda.so is compiled from the Ollama sources with native SASS for
# the requested GPU architecture, so the expensive PTX JIT step at load time
# is skipped. The remaining CUDA user-mode libraries (cuBLAS, cuBLASLt,
# cudart) are deleted from the image because GTAP interposes them at link
# time and provides its own copies.
#
# Build:
#   docker build --build-arg CMAKE_CUDA_ARCHITECTURES=89 -t ollama-demo .
#
# Run (via gtap):
#   gtap docker run --rm ollama-demo

# ---------------------------------------------------------------------------
# Stage 1: compile libggml-cuda.so with native SASS for the target GPU
# ---------------------------------------------------------------------------
FROM nvidia/cuda:13.0.1-devel-ubuntu24.04 AS ggml-cuda-build

# Target GPU architecture (SM version) for nvcc; override at build time.
ARG CMAKE_CUDA_ARCHITECTURES=121
# Ollama release tag to check out and build against.
ARG OLLAMA_VERSION=v0.17.7
# Build parallelism for the CUDA kernel compilation.
ARG PARALLEL=8

# Toolchain needed only in this throwaway build stage.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git cmake build-essential && \
    rm -rf /var/lib/apt/lists/*

# Shallow clone of the pinned release tag — full history is not needed.
RUN git clone --depth 1 --branch ${OLLAMA_VERSION} \
    https://github.com/ollama/ollama.git /ollama-src

WORKDIR /ollama-src

# Configure with the upstream 'CUDA 13' preset, build the CUDA component,
# and install it (stripped) into /ollama-dist for the runtime stage to copy.
RUN cmake --preset 'CUDA 13' \
        -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHITECTURES}" \
        -DCMAKE_INSTALL_PREFIX=/ollama-dist && \
    cmake --build --preset 'CUDA 13' --parallel ${PARALLEL} && \
    cmake --install build --component CUDA --strip

# ---------------------------------------------------------------------------
# Stage 2: runtime — plain Ubuntu, no CUDA distribution inside
# ---------------------------------------------------------------------------
FROM ubuntu:24.04

# Populated automatically by BuildKit (amd64/arm64); selects the release asset.
ARG TARGETARCH
ARG OLLAMA_VERSION=v0.17.7

# curl + CA certs to fetch the release; zstd to unpack the .tar.zst asset.
RUN apt-get update && \
    apt-get install -y --no-install-recommends curl ca-certificates zstd && \
    rm -rf /var/lib/apt/lists/*

# Install the Ollama release, then strip all bundled CUDA libraries.
# GTAP intercepts cuBLAS, cuBLASLt, and cudart via LD_AUDIT, so they
# are not needed in the image.
RUN mkdir -p /ollama && cd /ollama && \
    curl -L "https://github.com/ollama/ollama/releases/download/${OLLAMA_VERSION}/ollama-linux-${TARGETARCH}.tar.zst" \
        -o ollama.tar.zst && \
    tar --use-compress-program=unzstd -xf ollama.tar.zst && \
    rm ollama.tar.zst && \
    rm -rf lib/ollama/cuda_v12 && \
    rm -f lib/ollama/cuda_v13/libcublas* lib/ollama/cuda_v13/libcublasLt* lib/ollama/cuda_v13/libcudart*

# Replace PTX-only libggml-cuda.so with native SASS build
COPY --from=ggml-cuda-build \
    /ollama-dist/lib/ollama/cuda_v13/libggml-cuda.so \
    /ollama/lib/ollama/cuda_v13/libggml-cuda.so

ENV OLLAMA_PATH="/ollama"
ENV PATH="$OLLAMA_PATH/bin:$PATH"
# NOTE(review): libraries live under $OLLAMA_PATH/lib/ollama, not $OLLAMA_PATH
# itself — presumably ollama resolves them relative to its binary; confirm
# this path is what the loader actually needs.
ENV LD_LIBRARY_PATH="$OLLAMA_PATH"
ENV OLLAMA_KEEP_ALIVE="-1"
ENV OLLAMA_HOST="0.0.0.0:11434"
ENV GGML_CUDA_NO_PINNED=1

# Entrypoint: start ollama serve in the background, wait for readiness,
# then run the command.
#
# Two fixes over the naive version:
#  * If the requested command is exactly "ollama serve" (the default CMD),
#    exec it in the foreground instead — otherwise the background server
#    already holds :11434 and the exec'd second server fails to bind.
#  * Bail out (with the server log on stderr) if the background server
#    dies during startup or never becomes ready within ~60s, instead of
#    spinning in the readiness loop forever.
RUN cat <<'ENTRYPOINT' > /entrypoint.sh && chmod +x /entrypoint.sh
#!/bin/sh
if [ "$*" = "ollama serve" ]; then
  exec ollama serve
fi
ollama serve >/var/log/ollama.log 2>&1 &
server_pid=$!
tries=0
until curl -sf http://localhost:11434/api/tags >/dev/null 2>&1; do
  if ! kill -0 "$server_pid" 2>/dev/null; then
    echo "ollama serve exited during startup:" >&2
    cat /var/log/ollama.log >&2
    exit 1
  fi
  tries=$((tries + 1))
  if [ "$tries" -ge 600 ]; then
    echo "timed out waiting for ollama serve to become ready" >&2
    exit 1
  fi
  sleep 0.1
done
exec "$@"
ENTRYPOINT

WORKDIR /ollama
ENTRYPOINT ["/entrypoint.sh"]
CMD ["ollama", "serve"]