From b335fc09706b0072b7c7e7fd63be7aa07e26b5a9 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 16 Jun 2026 13:27:35 +0500 Subject: [PATCH 1/7] Update docker base image to ubuntu 24.04 --- .github/workflows/docker.yml | 2 +- docker/base/Dockerfile | 1 - docker/base/Dockerfile.common | 7 ++++++- src/tests/_internal/server/routers/test_runs.py | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 2b593144bd..ffdde2ca4a 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -24,7 +24,7 @@ jobs: strategy: matrix: flavor: ["base", "devel", "devel-efa"] - ubuntu_version: ["22"] + ubuntu_version: ["24"] steps: - name: Checkout repository uses: actions/checkout@v4 diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 29e92f45ba..96b4f8a1d0 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -11,7 +11,6 @@ ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi # Prerequisites RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ && apt-get update --fix-missing \ && apt-get upgrade -y \ && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index b486c16800..1d4234aa5c 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -16,7 +16,6 @@ ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 ENV NCCL_SOCKET_IFNAME=^docker,lo RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \ && apt-get update --fix-missing \ && apt-get upgrade -y \ && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ @@ -24,6 +23,12 @@ RUN export DEBIAN_FRONTEND=noninteractive \ && dpkg-reconfigure --frontend 
noninteractive tzdata \ && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ + # nvidia/cuda ships nvidia's apt singing key in legacy format (/etc/apt/trusted.gpg). + # This lead to warnings, so we install cuda-keyring. + && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-keyring_1.1-1_all.deb \ + && dpkg -i cuda-keyring_1.1-1_all.deb \ + && rm cuda-keyring_1.1-1_all.deb \ + && rm -f /etc/apt/sources.list.d/cuda.list \ && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ && mkdir /run/sshd \ && mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index b19a736bb4..6526c5d77f 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -390,7 +390,7 @@ def get_dev_env_run_dict( " && tail -f /dev/null" ), ] - image_name = "dstackai/base:0.13-base-ubuntu22.04" + image_name = f"dstackai/base:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}-base-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" return { "id": run_id, From 9ad0ac53e35e6bc26e980b308541d1c779effb69 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 16 Jun 2026 14:03:24 +0500 Subject: [PATCH 2/7] Remove subversion and mercurial from docker image --- docker/base/Dockerfile.common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index 1d4234aa5c..0bb37d09a9 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -21,7 +21,7 @@ RUN export DEBIAN_FRONTEND=noninteractive \ && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ && apt-get install 
-y tzdata \ && dpkg-reconfigure --frontend noninteractive tzdata \ - && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 mercurial openssh-server subversion wget \ + && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 openssh-server wget \ libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ # nvidia/cuda ships nvidia's apt singing key in legacy format (/etc/apt/trusted.gpg). # This lead to warnings, so we install cuda-keyring. From 14bd27af6713261b2cf58e76855a06b9b10985b3 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 16 Jun 2026 14:16:25 +0500 Subject: [PATCH 3/7] Restore tests update --- src/tests/_internal/server/routers/test_runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/_internal/server/routers/test_runs.py b/src/tests/_internal/server/routers/test_runs.py index 6526c5d77f..b19a736bb4 100644 --- a/src/tests/_internal/server/routers/test_runs.py +++ b/src/tests/_internal/server/routers/test_runs.py @@ -390,7 +390,7 @@ def get_dev_env_run_dict( " && tail -f /dev/null" ), ] - image_name = f"dstackai/base:{settings.DSTACK_DOCKER_BASE_IMAGE_VERSION}-base-ubuntu{settings.DSTACK_DOCKER_BASE_IMAGE_UBUNTU_VERSION}" + image_name = "dstackai/base:0.13-base-ubuntu22.04" return { "id": run_id, From 7cfcf9a1917ebeedfc3d5375e721df9ac2716036 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 16 Jun 2026 14:34:27 +0500 Subject: [PATCH 4/7] Redeclare UBUNTU_VERSION --- docker/base/Dockerfile | 3 +++ docker/base/Dockerfile.common | 3 +++ 2 files changed, 6 insertions(+) diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 96b4f8a1d0..5f51896251 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -4,6 +4,9 @@ ARG UBUNTU_VERSION # Build stage FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder +# ARGs before FROM must be redeclared +ARG 
UBUNTU_VERSION + ENV NCCL_HOME=/opt/nccl ENV CUDA_HOME=/usr/local/cuda ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index 0bb37d09a9..a40ce986c3 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -2,6 +2,9 @@ ARG UBUNTU_VERSION FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 +# ARGs before FROM must be redeclared +ARG UBUNTU_VERSION + ARG _UV_HOME="/opt/uv" ENV UV_INSTALL_DIR="${_UV_HOME}/bin" From fd350ff4516c7d31e3f46ca9433241b99dfc1d3b Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 16 Jun 2026 14:55:51 +0500 Subject: [PATCH 5/7] Fix /root/.ssh already exists --- docker/base/Dockerfile.common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common index a40ce986c3..4824b1f95e 100644 --- a/docker/base/Dockerfile.common +++ b/docker/base/Dockerfile.common @@ -34,7 +34,7 @@ RUN export DEBIAN_FRONTEND=noninteractive \ && rm -f /etc/apt/sources.list.d/cuda.list \ && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ && mkdir /run/sshd \ - && mkdir ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ + && mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ && chmod 600 ~/.ssh/authorized_keys \ && rm /etc/ssh/ssh_host_* \ # User From b3f5c837abc29bc352ad7ca8499f5778e7aa3a86 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 16 Jun 2026 15:58:36 +0500 Subject: [PATCH 6/7] Replace dockerfile-plus with docker buildx build --target --- .github/workflows/docker.yml | 11 +-- docker/base/Dockerfile | 154 ++++++++++++++++++++++++++++------ docker/base/Dockerfile.common | 52 ------------ docker/base/README.md | 20 ++++- docker/base/efa/Dockerfile | 54 ------------ docker/base/efa/README.md | 8 -- 6 files changed, 150 insertions(+), 149 deletions(-) delete mode 100644 docker/base/Dockerfile.common delete 
mode 100644 docker/base/efa/Dockerfile delete mode 100644 docker/base/efa/README.md diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index ffdde2ca4a..c17bf2cd95 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -45,18 +45,11 @@ jobs: df -h / - name: Build and upload to DockerHub run: | - if [ "${{ matrix.flavor }}" = "base" ]; then - FILE="base/Dockerfile" - elif [ "${{ matrix.flavor }}" = "devel" ]; then - FILE="base/Dockerfile" - else - FILE="base/efa/Dockerfile" - fi docker buildx build \ --platform linux/amd64 \ + --target ${{ matrix.flavor }} \ --tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \ - --build-arg FLAVOR=${{ matrix.flavor }} \ --build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \ --provenance=false \ --push \ - -f $FILE . + -f base/Dockerfile . diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 5f51896251..70b0dd8c50 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -1,18 +1,71 @@ -# syntax = edrevo/dockerfile-plus ARG UBUNTU_VERSION -# Build stage -FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder +# ============================================================================ +# common: shared base for all flavors. Select a flavor with `--target <flavor>` +# (base / devel / devel-efa). 
+# ============================================================================ -# ARGs before FROM must be redeclared +FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS common + +# ARGs before FROM must be redeclared to be used after FROM ARG UBUNTU_VERSION +ARG _UV_HOME="/opt/uv" + +ENV UV_INSTALL_DIR="${_UV_HOME}/bin" +ENV UV_MANAGED_PYTHON=1 +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +ENV PATH="${UV_INSTALL_DIR}:${PATH}" + +ENV OMPI_MCA_pml=^cm,ucx +ENV OMPI_MCA_btl=tcp,self +ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 +ENV NCCL_SOCKET_IFNAME=^docker,lo + +RUN export DEBIAN_FRONTEND=noninteractive \ + && apt-get update --fix-missing \ + && apt-get upgrade -y \ + && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ + && apt-get install -y tzdata \ + && dpkg-reconfigure --frontend noninteractive tzdata \ + && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 openssh-server wget \ + libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ + # nvidia/cuda ships nvidia's apt signing key in legacy format (/etc/apt/trusted.gpg). + # This leads to warnings, so we install cuda-keyring. 
+ && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-keyring_1.1-1_all.deb \ + && dpkg -i cuda-keyring_1.1-1_all.deb \ + && rm cuda-keyring_1.1-1_all.deb \ + && rm -f /etc/apt/sources.list.d/cuda.list \ + && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ + && mkdir /run/sshd \ + && mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ + && chmod 600 ~/.ssh/authorized_keys \ + && rm /etc/ssh/ssh_host_* \ + # User + && apt-get install -y sudo \ + && groupadd -g 1000 dstack \ + && useradd -u 1000 -g 1000 -G sudo -s /bin/bash -m dstack \ + && echo 'dstack ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/dstack \ + # Default working dir + && mkdir -p /dstack/run \ + && chmod a+rwx /dstack/run \ + # Cleanup + && rm -rf /var/lib/apt/lists/* + +RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \ + && uv python install --preview --default + +# ============================================================================ +# builder: builds NCCL and nccl-tests from source for the base/devel flavors. 
+# ============================================================================ + +FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS builder + ENV NCCL_HOME=/opt/nccl ENV CUDA_HOME=/usr/local/cuda ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi -# Prerequisites - RUN export DEBIAN_FRONTEND=noninteractive \ && apt-get update --fix-missing \ && apt-get upgrade -y \ @@ -33,8 +86,6 @@ RUN export DEBIAN_FRONTEND=noninteractive \ python3 \ build-essential -# NCCL - ARG NCCL_VERSION=2.26.2-1 RUN cd /tmp \ @@ -42,8 +93,6 @@ RUN cd /tmp \ && cd nccl \ && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} -# NCCL tests - RUN cd /opt \ && git clone https://github.com/NVIDIA/nccl-tests \ && cd nccl-tests \ @@ -53,31 +102,86 @@ RUN cd /opt \ CUDA_HOME=${CUDA_HOME} \ NCCL_HOME=${NCCL_HOME} -# Final stage +# ============================================================================ +# base: common + NCCL (from builder) + OpenMPI. +# ============================================================================ -INCLUDE+ base/Dockerfile.common +FROM common AS base ENV NCCL_HOME=/opt/nccl COPY --from=builder ${NCCL_HOME} ${NCCL_HOME} COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build -ARG FLAVOR - -# MPI, NVCC, and /etc/ld.so.conf.d - RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - openmpi-bin \ - && if [ "$FLAVOR" = "devel" ]; then \ - cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - libhwloc-dev; \ - fi \ + && apt-get install -y --no-install-recommends openmpi-bin \ && rm -rf /var/lib/apt/lists/* \ && echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \ && ldconfig WORKDIR /dstack/run + +# ============================================================================ +# devel: base + CUDA development libraries and NVCC. 
+# ============================================================================ + +FROM base AS devel + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + libhwloc-dev \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /dstack/run + +# ============================================================================ +# devel-efa: common + CUDA dev libraries + AWS EFA + EFA-aware NCCL. +# ============================================================================ + +FROM common AS devel-efa + +ENV NCCL_HOME=/usr/local +ENV CUDA_HOME=/usr/local/cuda +ENV LIBFABRIC_PATH=/opt/amazon/efa +ENV OPEN_MPI_PATH=/opt/amazon/openmpi +ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" + +RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + cuda-libraries-dev-${cuda_version} \ + cuda-nvcc-${cuda_version} \ + && rm -rf /var/lib/apt/lists/* + +ARG EFA_VERSION=1.48.0 + +RUN cd /tmp \ + && apt-get update \ + && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ + && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && ./efa_installer.sh -y --skip-kmod -g \ + && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* + +ARG NCCL_VERSION=2.27.7-1 + +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ + && rm -rf /tmp/nccl + +RUN cd /opt \ + && git clone https://github.com/NVIDIA/nccl-tests \ + && cd nccl-tests \ + && make -j$(nproc) \ + MPI=1 \ + MPI_HOME=${OPEN_MPI_PATH} \ + CUDA_HOME=${CUDA_HOME} \ + NCCL_HOME=${NCCL_HOME} + +WORKDIR /dstack/run diff --git 
a/docker/base/Dockerfile.common b/docker/base/Dockerfile.common deleted file mode 100644 index 4824b1f95e..0000000000 --- a/docker/base/Dockerfile.common +++ /dev/null @@ -1,52 +0,0 @@ -ARG UBUNTU_VERSION - -FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 - -# ARGs before FROM must be redeclared -ARG UBUNTU_VERSION - -ARG _UV_HOME="/opt/uv" - -ENV UV_INSTALL_DIR="${_UV_HOME}/bin" -ENV UV_MANAGED_PYTHON=1 -ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 - -ENV PATH="${UV_INSTALL_DIR}:${PATH}" - -ENV OMPI_MCA_pml=^cm,ucx -ENV OMPI_MCA_btl=tcp,self -ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0 -ENV NCCL_SOCKET_IFNAME=^docker,lo - -RUN export DEBIAN_FRONTEND=noninteractive \ - && apt-get update --fix-missing \ - && apt-get upgrade -y \ - && ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \ - && apt-get install -y tzdata \ - && dpkg-reconfigure --frontend noninteractive tzdata \ - && apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 openssh-server wget \ - libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \ - # nvidia/cuda ships nvidia's apt singing key in legacy format (/etc/apt/trusted.gpg). - # This lead to warnings, so we install cuda-keyring. 
- && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-keyring_1.1-1_all.deb \ - && dpkg -i cuda-keyring_1.1-1_all.deb \ - && rm cuda-keyring_1.1-1_all.deb \ - && rm -f /etc/apt/sources.list.d/cuda.list \ - && sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \ - && mkdir /run/sshd \ - && mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ - && chmod 600 ~/.ssh/authorized_keys \ - && rm /etc/ssh/ssh_host_* \ - # User - && apt-get install -y sudo \ - && groupadd -g 1000 dstack \ - && useradd -u 1000 -g 1000 -G sudo -s /bin/bash -m dstack \ - && echo 'dstack ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/dstack \ - # Default working dir - && mkdir -p /dstack/run \ - && chmod a+rwx /dstack/run \ - # Cleanup - && rm -rf /var/lib/apt/lists/* - -RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \ - && uv python install --preview --default diff --git a/docker/base/README.md b/docker/base/README.md index 3bbbafa732..5246cbc40b 100644 --- a/docker/base/README.md +++ b/docker/base/README.md @@ -1 +1,19 @@ -Image for `dstack` runner instances. +# dstack base images + +Base images for `dstack` runner instances. A single multi-stage `Dockerfile` +produces all flavors; select one with `docker build --target <flavor>`: + +- **base** — CUDA 12.8, Python (uv-managed), NCCL 2.26.2-1 + NCCL Tests, Open MPI. +- **devel** — `base` plus the CUDA development libraries and NVCC. +- **devel-efa** — CUDA 12.8, Python (uv-managed), CUDA dev libraries and NVCC, + AWS EFA Installer 1.48.0 (Libfabric + Open MPI + AWS OFI NCCL 1.19.0), and an + EFA-aware NCCL 2.27.7-1 build + NCCL Tests. Built from `common`, not `base`. + +Build args: `UBUNTU_VERSION` (e.g. `24`). + +Example: + +```bash +docker build --target devel-efa --build-arg UBUNTU_VERSION=24 \ + -t dstackai/base:local-devel-efa-ubuntu24.04 -f base/Dockerfile . 
+``` diff --git a/docker/base/efa/Dockerfile b/docker/base/efa/Dockerfile deleted file mode 100644 index 3ea6a49704..0000000000 --- a/docker/base/efa/Dockerfile +++ /dev/null @@ -1,54 +0,0 @@ -# syntax = edrevo/dockerfile-plus - -INCLUDE+ base/Dockerfile.common - -ENV NCCL_HOME=/usr/local -ENV CUDA_HOME=/usr/local/cuda -ENV LIBFABRIC_PATH=/opt/amazon/efa -ENV OPEN_MPI_PATH=/opt/amazon/openmpi -ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}" -ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}" - -# Prerequisites - -RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \ - && apt-get update \ - && apt-get install -y --no-install-recommends \ - cuda-libraries-dev-${cuda_version} \ - cuda-nvcc-${cuda_version} \ - && rm -rf /var/lib/apt/lists/* - -# EFA - -ARG EFA_VERSION=1.48.0 - -RUN cd /tmp \ - && apt-get update \ - && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ - && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y --skip-kmod -g \ - && rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* - -# NCCL - -ARG NCCL_VERSION=2.27.7-1 - -RUN cd /tmp \ - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ - && cd nccl \ - && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \ - && rm -rf /tmp/nccl - -# NCCL Tests - -RUN cd /opt \ - && git clone https://github.com/NVIDIA/nccl-tests \ - && cd nccl-tests \ - && make -j$(nproc) \ - MPI=1 \ - MPI_HOME=${OPEN_MPI_PATH} \ - CUDA_HOME=${CUDA_HOME} \ - NCCL_HOME=${NCCL_HOME} - -WORKDIR /dstack/run diff --git a/docker/base/efa/README.md b/docker/base/efa/README.md deleted file mode 100644 index 9790d84d78..0000000000 --- a/docker/base/efa/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# dstack AWS EFA - -This image has the following installed: - -* CUDA 12.8 -* AWS EFA Installer 1.48.0 (Libfabric + Open MPI + AWS OFI NCCL 1.19.0) -* NCCL 2.27.7-1 -* NCCL Tests From 
5ceef59b7fe9e75083fc18552b02847fa5a2dc71 Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Tue, 16 Jun 2026 16:53:44 +0500 Subject: [PATCH 7/7] Change dstack user uid/gid to 1001 --- docker/base/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 70b0dd8c50..6c01f2e94d 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -42,10 +42,10 @@ RUN export DEBIAN_FRONTEND=noninteractive \ && mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \ && chmod 600 ~/.ssh/authorized_keys \ && rm /etc/ssh/ssh_host_* \ - # User + # User: UID/GID 1001 because Ubuntu 24.04 ships a default 'ubuntu' user at 1000. && apt-get install -y sudo \ - && groupadd -g 1000 dstack \ - && useradd -u 1000 -g 1000 -G sudo -s /bin/bash -m dstack \ + && groupadd -g 1001 dstack \ + && useradd -u 1001 -g 1001 -G sudo -s /bin/bash -m dstack \ && echo 'dstack ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/dstack \ # Default working dir && mkdir -p /dstack/run \