diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 2b593144b..c17bf2cd9 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -24,7 +24,7 @@ jobs: strategy: matrix: flavor: ["base", "devel", "devel-efa"] - ubuntu_version: ["22"] + ubuntu_version: ["24"] steps: - name: Checkout repository uses: actions/checkout@v4 @@ -45,18 +45,11 @@ jobs: df -h / - name: Build and upload to DockerHub run: | - if [ "${{ matrix.flavor }}" = "base" ]; then - FILE="base/Dockerfile" - elif [ "${{ matrix.flavor }}" = "devel" ]; then - FILE="base/Dockerfile" - else - FILE="base/efa/Dockerfile" - fi docker buildx build \ --platform linux/amd64 \ + --target ${{ matrix.flavor }} \ --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \ - --build-arg FLAVOR=${{ matrix.flavor }} \ --build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \ --provenance=false \ --push \ - -f $FILE . + -f base/Dockerfile . diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 29e92f45b..6c01f2e94 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,17 +1,72 @@ -# syntax = edrevo/dockerfile-plus ARG UBUNTU_VERSION -# Build stage -FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder +# ============================================================================ +# common: shared base for all flavors. Select a flavor with `--target <flavor>` +# (base / devel / devel-efa). 
+# ============================================================================ + +FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS common + +# ARGs before FROM must be redeclared to be used after FROM +ARG UBUNTU_VERSION + +ARG _UV_HOME="/opt/uv" + +ENV UV_INSTALL_DIR="${_UV_HOME}/bin" +ENV UV_MANAGED_PYTHON=1 +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +ENV PATH="${UV_INSTALL_DIR}:${PATH}" + +ENV OMPI_MCA_pml=^cm,ucx +ENV OMPI_MCA_btl=tcp,self +ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 +ENV NCCL_SOCKET_IFNAME=^docker,lo + +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 openssh-server wget \ + libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ + # nvidia/cuda ships nvidia's apt signing key in legacy format (/etc/apt/trusted.gpg). + # This leads to warnings, so we install cuda-keyring. + && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-keyring_1.1-1_all.deb \ + && dpkg -i cuda-keyring_1.1-1_all.deb \ + && rm cuda-keyring_1.1-1_all.deb \ + && rm -f /etc/apt/sources.list.d/cuda.list \ + && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ + && mkdir /run/sshd \ + && mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ + && chmod 600 ~/.ssh/authorized_keys \ + && rm /etc/ssh/ssh_host_* \ + # User: UID/GID 1001 because Ubuntu 24.04 ships a default 'ubuntu' user at 1000. 
+ && apt-get install -y sudo \ + && groupadd -g 1001 dstack \ + && useradd -u 1001 -g 1001 -G sudo -s /bin/bash -m dstack \ + && echo 'dstack ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/dstack \ + # Default working dir + && mkdir -p /dstack/run \ + && chmod a+rwx /dstack/run \ + # Cleanup + && rm -rf /var/lib/apt/lists/* + +RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \ + && uv python install --preview --default + +# ============================================================================ +# builder: builds NCCL and nccl-tests from source for the base/devel flavors. +# ============================================================================ + +FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS builder ENV NCCL_HOME=/opt/nccl ENV CUDA_HOME=/usr/local/cuda ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi -# Prerequisites - RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ && apt-get update --fix-missing \ && apt-get upgrade -y \ && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ @@ -31,8 +86,6 @@ RUN export DEBIAN_FRONTEND=noninteractive \ python3 \ build-essential -# NCCL - ARG NCCL_VERSION=2.26.2-1 RUN cd /tmp \ @@ -40,8 +93,6 @@ RUN cd /tmp \ && cd nccl \ && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} -# NCCL tests - RUN cd /opt \ && git clone https://github.com/NVIDIA/nccl-tests \ && cd nccl-tests \ @@ -51,31 +102,86 @@ RUN cd /opt \ CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} -# Final stage +# ============================================================================ +# base: common + NCCL (from builder) + OpenMPI. 
+# ============================================================================ -INCLUDE+ base/Dockerfile.common +FROM common AS base ENV NCCL_HOME=/opt/nccl COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build -ARG FLAVOR - -# MPI, NVCC, and /etc/ld.so.conf.d - RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - openmpi-bin \ - && if [ "$FLAVOR" = "devel" ]; then \ - cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev; \ - fi \ + && apt-get install -y --no-install-recommends openmpi-bin \ && rm -rf /var/lib/apt/lists/* \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && ldconfig WORKDIR /dstack/run + +# ============================================================================ +# devel: base + CUDA development libraries and NVCC. +# ============================================================================ + +FROM base AS devel + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /dstack/run + +# ============================================================================ +# devel-efa: common + CUDA dev libraries + AWS EFA + EFA-aware NCCL. +# ============================================================================ + +FROM common AS devel-efa + +ENV NCCL_HOME=/usr/local +ENV CUDA_HOME=/usr/local/cuda +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV OPEN_MPI_PATH=/opt/amazon/openmpi +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . 
'{ print $1"-"$2 }') \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + && rm -rf /var/lib/apt/lists/* + +ARG EFA_VERSION=1.48.0 + +RUN cd /tmp \ + && apt-get update \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* + +ARG NCCL_VERSION=2.27.7-1 + +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && rm -rf /tmp/nccl + +RUN cd /opt \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + +WORKDIR /dstack/run diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common deleted file mode 100644 index b486c1680..000000000 --- a/docker/base/Dockerfile.common +++ /dev/null @@ -1,44 +0,0 @@ -ARG UBUNTU_VERSION - -FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 - -ARG _UV_HOME="/opt/uv" - -ENV UV_INSTALL_DIR="${_UV_HOME}/bin" -ENV UV_MANAGED_PYTHON=1 -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 - -ENV PATH="${UV_INSTALL_DIR}:${PATH}" - -ENV OMPI_MCA_pml=^cm,ucx -ENV OMPI_MCA_btl=tcp,self -ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 -ENV NCCL_SOCKET_IFNAME=^docker,lo - -RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ - && apt-get update --fix-missing \ - && apt-get upgrade -y \ - && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ - && apt-get install -y tzdata \ - && dpkg-reconfigure --frontend noninteractive tzdata \ - && apt-get install -y bzip2 
ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ - libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ - && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ - && mkdir /run/sshd \ - && mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ - && chmod 600 ~/.ssh/authorized_keys \ - && rm /etc/ssh/ssh_host_* \ - # User - && apt-get install -y sudo \ - && groupadd -g 1000 dstack \ - && useradd -u 1000 -g 1000 -G sudo -s /bin/bash -m dstack \ - && echo 'dstack ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/dstack \ - # Default working dir - && mkdir -p /dstack/run \ - && chmod a+rwx /dstack/run \ - # Cleanup - && rm -rf /var/lib/apt/lists/* - -RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \ - && uv python install --preview --default diff --git a/docker/base/README.md b/docker/base/README.md index 3bbbafa73..5246cbc40 100644 --- a/docker/base/README.md +++ b/docker/base/README.md @@ -1 +1,19 @@ -Image for `dstack` runner instances. +# dstack base images + +Base images for `dstack` runner instances. A single multi-stage `Dockerfile` +produces all flavors; select one with `docker build --target <flavor>`: + +- **base** — CUDA 12.8, Python (uv-managed), NCCL 2.26.2-1 + NCCL Tests, Open MPI. +- **devel** — `base` plus the CUDA development libraries and NVCC. +- **devel-efa** — built from the shared `common` stage (not `base`): CUDA dev libraries, AWS EFA Installer 1.48.0 + (Libfabric + Open MPI + AWS OFI NCCL 1.19.0), and an EFA-aware NCCL 2.27.7-1 + build + NCCL Tests. + +Build args: `UBUNTU_VERSION` (e.g. `24`). + +Example: + +```bash +docker build --target devel-efa --build-arg UBUNTU_VERSION=24 \ + -t dstackai/base:local-devel-efa-ubuntu24.04 -f base/Dockerfile . 
+``` diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile deleted file mode 100644 index 3ea6a4970..000000000 --- a/docker/base/efa/Dockerfile +++ /dev/null @@ -1,54 +0,0 @@ -# syntax = edrevo/dockerfile-plus - -INCLUDE+ base/Dockerfile.common - -ENV NCCL_HOME=/usr/local -ENV CUDA_HOME=/usr/local/cuda -ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" - -# Prerequisites - -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get update \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - && rm -rf /var/lib/apt/lists/* - -# EFA - -ARG EFA_VERSION=1.48.0 - -RUN cd /tmp \ - && apt-get update \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ - && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g \ - && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* - -# NCCL - -ARG NCCL_VERSION=2.27.7-1 - -RUN cd /tmp \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && rm -rf /tmp/nccl - -# NCCL Tests - -RUN cd /opt \ - && git clone https://github.com/NVIDIA/nccl-tests \ - && cd nccl-tests \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} - -WORKDIR /dstack/run diff --git a/docker/base/efa/README.md b/docker/base/efa/README.md deleted file mode 100644 index 9790d84d7..000000000 --- a/docker/base/efa/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# dstack AWS EFA - -This image has the following installed: - -* CUDA 12.8 -* AWS EFA Installer 1.48.0 (Libfabric + Open MPI + AWS OFI NCCL 1.19.0) -* NCCL 2.27.7-1 -* NCCL Tests