Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
strategy:
matrix:
flavor: ["base", "devel", "devel-efa"]
ubuntu_version: ["22"]
ubuntu_version: ["24"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -45,18 +45,11 @@ jobs:
df -h /
- name: Build and upload to DockerHub
run: |
if [ "${{ matrix.flavor }}" = "base" ]; then
FILE="base/Dockerfile"
elif [ "${{ matrix.flavor }}" = "devel" ]; then
FILE="base/Dockerfile"
else
FILE="base/efa/Dockerfile"
fi
docker buildx build \
--platform linux/amd64 \
--target ${{ matrix.flavor }} \
--tag dstackai/${{ env.BUILD_DOCKER_REPO }}:${{ inputs.image_version }}-${{ matrix.flavor }}-ubuntu${{ matrix.ubuntu_version }}.04 \
--build-arg FLAVOR=${{ matrix.flavor }} \
--build-arg UBUNTU_VERSION=${{ matrix.ubuntu_version }} \
--provenance=false \
--push \
-f $FILE .
-f base/Dockerfile .
156 changes: 131 additions & 25 deletions docker/base/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,17 +1,72 @@
# syntax = edrevo/dockerfile-plus
ARG UBUNTU_VERSION

# Build stage
FROM nvidia/cuda:12.8.1-base-ubuntu${UBUNTU_VERSION}.04 AS builder
# ============================================================================
# common: shared base for all flavors. Select a flavor with `--target <flavor>`
# (base / devel / devel-efa).
# ============================================================================

# Stage `common`: OS packages, SSH daemon setup, the `dstack` user, uv and a
# uv-managed Python. The base, devel and devel-efa stages all derive from it.
FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS common

# ARGs declared before FROM must be redeclared to be usable after FROM.
# UBUNTU_VERSION is used below to build the cuda-keyring download URL.
ARG UBUNTU_VERSION

# Root directory for uv. Declared as ARG, so it is a build-time value only.
ARG _UV_HOME="/opt/uv"

# uv configuration. UV_INSTALL_DIR is where the uv installer (run further
# below) places its binaries; it is prepended to PATH a few lines down.
# NOTE(review): UV_MANAGED_PYTHON=1 presumably restricts uv to its own managed
# Python builds — confirm against the uv documentation.
ENV UV_INSTALL_DIR="${_UV_HOME}/bin"
ENV UV_MANAGED_PYTHON=1
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8

ENV PATH="${UV_INSTALL_DIR}:${PATH}"

# Open MPI MCA parameters supplied through the environment. A leading `^`
# means "exclude": skip the cm and ucx PMLs, use only the tcp and self BTLs,
# and keep the TCP BTL off the lo and docker0 interfaces.
ENV OMPI_MCA_pml=^cm,ucx
ENV OMPI_MCA_btl=tcp,self
ENV OMPI_MCA_btl_tcp_if_exclude=lo,docker0
# NCCL: do not use interfaces whose names start with "docker" or "lo".
ENV NCCL_SOCKET_IFNAME=^docker,lo

# Single layer: system packages (including the InfiniBand/verbs userspace
# libraries), NVIDIA apt keyring, sshd configuration, the dstack user and the
# default working directory. DEBIAN_FRONTEND is exported only inside this RUN,
# so it does not leak into the image's runtime environment.
RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get update --fix-missing \
&& apt-get upgrade -y \
# Time zone is fixed to America/New_York; tzdata is reconfigured non-interactively.
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
&& apt-get install -y tzdata \
&& dpkg-reconfigure --frontend noninteractive tzdata \
&& apt-get install -y bzip2 ca-certificates curl build-essential git libglib2.0-0 libsm6 libxext6 libxrender1 openssh-server wget \
libibverbs1 ibverbs-providers ibverbs-utils libibverbs-dev infiniband-diags \
# nvidia/cuda ships NVIDIA's apt signing key in the legacy format (/etc/apt/trusted.gpg).
# This leads to warnings, so we install cuda-keyring.
&& wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/cuda-keyring_1.1-1_all.deb \
&& dpkg -i cuda-keyring_1.1-1_all.deb \
&& rm cuda-keyring_1.1-1_all.deb \
# NOTE(review): presumably drops the base image's own CUDA repo entry in favor
# of the one cuda-keyring provides — confirm.
&& rm -f /etc/apt/sources.list.d/cuda.list \
# sshd: disable password authentication, create the /run/sshd runtime
# directory, and prepare an empty authorized_keys (mode 600) under ~/.ssh of
# the user executing this build step.
&& sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config \
&& mkdir /run/sshd \
&& mkdir -p ~/.ssh && chmod 700 ~/.ssh && touch ~/.ssh/authorized_keys \
&& chmod 600 ~/.ssh/authorized_keys \
# Remove the SSH host keys from the image.
# NOTE(review): presumably they are regenerated when a container starts — confirm.
&& rm /etc/ssh/ssh_host_* \
# User: UID/GID 1001 because Ubuntu 24.04 ships a default 'ubuntu' user at 1000.
# The dstack user is added to the sudo group and gets passwordless sudo.
&& apt-get install -y sudo \
&& groupadd -g 1001 dstack \
&& useradd -u 1001 -g 1001 -G sudo -s /bin/bash -m dstack \
&& echo 'dstack ALL=(ALL) NOPASSWD: ALL' > /etc/sudoers.d/dstack \
# Default working dir (readable, writable and searchable by every user)
&& mkdir -p /dstack/run \
&& chmod a+rwx /dstack/run \
# Cleanup: drop the apt package lists in the same layer that created them
&& rm -rf /var/lib/apt/lists/*

# Install uv with its installer script, then install a uv-managed Python.
# INSTALLER_NO_MODIFY_PATH=1 is passed because PATH is already set via ENV above.
# NOTE(review): `--default` is expected to also provide the plain
# `python`/`python3` executables — confirm against the uv documentation.
RUN curl -LsSf https://astral.sh/uv/install.sh | INSTALLER_NO_MODIFY_PATH=1 sh \
&& uv python install --preview --default

# ============================================================================
# builder: builds NCCL and nccl-tests from source for the base/devel flavors.
# ============================================================================

FROM nvidia/cuda:12.8.2-base-ubuntu${UBUNTU_VERSION}.04 AS builder

ENV NCCL_HOME=/opt/nccl
ENV CUDA_HOME=/usr/local/cuda
ENV OPEN_MPI_PATH=/usr/lib/x86_64-linux-gnu/openmpi

# Prerequisites

RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}04/x86_64/3bf863cc.pub \
&& apt-get update --fix-missing \
&& apt-get upgrade -y \
&& ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime \
Expand All @@ -31,17 +86,13 @@ RUN export DEBIAN_FRONTEND=noninteractive \
python3 \
build-essential

# NCCL

# NCCL release to build; the upstream git tag that gets cloned is v${NCCL_VERSION}.
ARG NCCL_VERSION=2.26.2-1

# Clone that tag into /tmp and build it, writing the build output to
# ${NCCL_HOME} so that another stage can copy it out of this one.
RUN cd /tmp \
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
&& cd nccl \
&& make -j$(nproc) src.build BUILDDIR=${NCCL_HOME}

# NCCL tests

RUN cd /opt \
&& git clone https://github.com/NVIDIA/nccl-tests \
&& cd nccl-tests \
Expand All @@ -51,31 +102,86 @@ RUN cd /opt \
CUDA_HOME=${CUDA_HOME} \
NCCL_HOME=${NCCL_HOME}

# Final stage
# ============================================================================
# base: common + NCCL (from builder) + OpenMPI.
# ============================================================================

INCLUDE+ base/Dockerfile.common
FROM common AS base

ENV NCCL_HOME=/opt/nccl

COPY --from=builder ${NCCL_HOME} ${NCCL_HOME}
COPY --from=builder /opt/nccl-tests/build /opt/nccl-tests/build

ARG FLAVOR

# MPI, NVCC, and /etc/ld.so.conf.d

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
openmpi-bin \
&& if [ "$FLAVOR" = "devel" ]; then \
cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
&& apt-get install -y --no-install-recommends \
cuda-libraries-dev-${cuda_version} \
cuda-nvcc-${cuda_version} \
libhwloc-dev; \
fi \
&& apt-get install -y --no-install-recommends openmpi-bin \
&& rm -rf /var/lib/apt/lists/* \
&& echo "${NCCL_HOME}/lib" >> /etc/ld.so.conf.d/nccl.conf \
&& ldconfig

WORKDIR /dstack/run

# ============================================================================
# devel: base + CUDA development libraries and NVCC.
# ============================================================================

FROM base AS devel

# Add the CUDA development libraries, NVCC and the hwloc headers on top of `base`.
# The CUDA apt packages are suffixed with "<major>-<minor>", which is derived
# here from CUDA_VERSION (not set in this file; presumably exported by the
# nvidia/cuda base image). The apt lists are removed again in the same layer.
RUN apt-get update \
    && cuda_pkg_suffix="$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }')" \
    && apt-get install -y --no-install-recommends \
        "cuda-libraries-dev-${cuda_pkg_suffix}" \
        "cuda-nvcc-${cuda_pkg_suffix}" \
        libhwloc-dev \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /dstack/run

# ============================================================================
# devel-efa: common + CUDA dev libraries + AWS EFA + EFA-aware NCCL.
# ============================================================================

# Stage `devel-efa`: built directly on `common` (not on `base`). It adds the
# CUDA development libraries and NVCC, the AWS EFA installer payload, and an
# NCCL + nccl-tests build compiled against the Open MPI found at OPEN_MPI_PATH.
FROM common AS devel-efa

# NCCL is built straight into /usr/local in this flavor. LIBFABRIC_PATH and
# OPEN_MPI_PATH point at the locations under /opt/amazon that this stage
# expects the EFA installer to populate; their bin/lib directories are put
# ahead of the existing PATH / LD_LIBRARY_PATH entries.
ENV NCCL_HOME=/usr/local
ENV CUDA_HOME=/usr/local/cuda
ENV LIBFABRIC_PATH=/opt/amazon/efa
ENV OPEN_MPI_PATH=/opt/amazon/openmpi
ENV PATH="${LIBFABRIC_PATH}/bin:${OPEN_MPI_PATH}/bin:${PATH}"
ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${LD_LIBRARY_PATH}"

# CUDA development libraries and NVCC. The apt package names carry a
# "<major>-<minor>" suffix derived from CUDA_VERSION (not set in this file;
# presumably exported by the nvidia/cuda base image).
RUN cuda_version=$(echo ${CUDA_VERSION} | awk -F . '{ print $1"-"$2 }') \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        cuda-libraries-dev-${cuda_version} \
        cuda-nvcc-${cuda_version} \
    && rm -rf /var/lib/apt/lists/*

ARG EFA_VERSION=1.48.0

# AWS EFA installer.
# - curl runs with --fail (-f) so an HTTP error aborts the build right here
#   instead of saving an error page that only breaks later in `tar`.
# - `apt-get update` stays in this layer because the apt lists were removed by
#   the previous RUN.
# - The downloaded tarball is deleted together with the unpacked directory;
#   otherwise it would remain in this image layer.
# NOTE(review): installer flags are taken as-is (-y, --skip-kmod, -g) — see the
# AWS EFA installer documentation for their exact meaning.
RUN cd /tmp \
    && apt-get update \
    && curl -fsSO https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
    && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y --skip-kmod -g \
    && rm -rf /tmp/aws-efa-installer \
        /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz \
        /var/lib/apt/lists/*

# NCCL release built for this flavor; the cloned git tag is v${NCCL_VERSION}.
ARG NCCL_VERSION=2.27.7-1

# Build NCCL with its output written to ${NCCL_HOME} (/usr/local), then remove
# the source checkout in the same layer.
RUN cd /tmp \
    && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
    && cd nccl \
    && make -j$(nproc) src.build BUILDDIR=${NCCL_HOME} \
    && rm -rf /tmp/nccl

# Build nccl-tests with MPI support against the Open MPI at OPEN_MPI_PATH.
# The checkout stays in /opt/nccl-tests so the built binaries remain available.
RUN cd /opt \
    && git clone https://github.com/NVIDIA/nccl-tests \
    && cd nccl-tests \
    && make -j$(nproc) \
        MPI=1 \
        MPI_HOME=${OPEN_MPI_PATH} \
        CUDA_HOME=${CUDA_HOME} \
        NCCL_HOME=${NCCL_HOME}

WORKDIR /dstack/run
44 changes: 0 additions & 44 deletions docker/base/Dockerfile.common

This file was deleted.

20 changes: 19 additions & 1 deletion docker/base/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,19 @@
Image for `dstack` runner instances.
# dstack base images

Base images for `dstack` runner instances. A single multi-stage `Dockerfile`
produces all flavors; select one with `docker build --target <flavor>`:

- **base** — CUDA 12.8, Python (uv-managed), NCCL 2.26.2-1 + NCCL Tests, Open MPI.
- **devel** — `base` plus the CUDA development libraries and NVCC.
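- **devel-efa** — built on the shared `common` stage rather than on `base`
(so it does not include base's NCCL 2.26.2-1 build): CUDA 12.8, Python
(uv-managed), CUDA dev libraries, AWS EFA Installer 1.48.0
(Libfabric + Open MPI + AWS OFI NCCL 1.19.0), and an EFA-aware NCCL 2.27.7-1
build + NCCL Tests.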

Build args: `UBUNTU_VERSION` (e.g. `24`).

Example:

```bash
docker build --target devel-efa --build-arg UBUNTU_VERSION=24 \
-t dstackai/base:local-devel-efa-ubuntu24.04 -f base/Dockerfile .
```
54 changes: 0 additions & 54 deletions docker/base/efa/Dockerfile

This file was deleted.

8 changes: 0 additions & 8 deletions docker/base/efa/README.md

This file was deleted.

Loading