# Build llama.cpp server for linux/amd64 (x86_64; runs on Linux hosts or under Docker Desktop on Windows)
# Supports vision models such as Llama 3.2 Vision (11B) and Qwen2.5-VL (3B/7B) for image understanding
FROM ubuntu:22.04 AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
cmake \
git \
curl \
libcurl4-openssl-dev \
&& rm -rf /var/lib/apt/lists/*
# Clone llama.cpp repository
WORKDIR /build
RUN git clone https://github.com/ggerganov/llama.cpp.git .
# Build llama.cpp server with multimodal support
RUN mkdir build && cd build && \
cmake .. \
-DLLAMA_CURL=ON \
-DLLAMA_BUILD_SERVER=ON \
-DBUILD_SHARED_LIBS=OFF \
-DCMAKE_BUILD_TYPE=Release && \
cmake --build . --config Release -j$(nproc)
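# The statically linked binaries land in /build/build/bin; the runtime stage below
# copies llama-server from there.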
# Runtime stage
FROM ubuntu:22.04
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
libcurl4 \
curl \
libgomp1 \
&& rm -rf /var/lib/apt/lists/*
# Copy built binaries
COPY --from=builder /build/build/bin/llama-server /usr/local/bin/llama-server
# Create models directory
RUN mkdir -p /models
# Copy vision model (single model per image)
ARG MODEL_FILE
ARG MMPROJ_FILE
COPY docker/llama-cpp/models/${MODEL_FILE} /models/model.gguf
COPY docker/llama-cpp/models/${MMPROJ_FILE} /models/mmproj.gguf
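# Example build invocation (a sketch; file names are illustrative and assume this
# Dockerfile lives at docker/llama-cpp/Dockerfile with the repo root as build context):
#   docker build -f docker/llama-cpp/Dockerfile \
#     --build-arg MODEL_FILE=Qwen2.5-VL-7B-Instruct-Q4_K_M.gguf \
#     --build-arg MMPROJ_FILE=mmproj-Qwen2.5-VL-7B-Instruct-f16.gguf \
#     -t llama-server-vision .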
# Expose port
EXPOSE 8080
# Health check
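# llama-server's /health endpoint returns 503 while the model is loading and 200 once it is ready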
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8080/health || exit 1
# Set working directory
WORKDIR /app
# Run llama-server with vision capabilities
# --mmproj is required for vision understanding
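# --ctx-size is shared across the --parallel slots (32768 / 4 = 8192 tokens per slot)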
ENTRYPOINT ["/usr/local/bin/llama-server"]
CMD ["--host", "0.0.0.0", "--port", "8080", "--model", "/models/model.gguf", "--mmproj", "/models/mmproj.gguf", "--ctx-size", "32768", "--parallel", "4", "--threads", "-1"]