Skip to content

Commit

Permalink
feat: support lts and production nvidia modules
Browse files Browse the repository at this point in the history
Support LTS and production versions of NVIDIA kernel modules as per https://docs.nvidia.com/datacenter/tesla/drivers/index.html#lifecycle

Part of: siderolabs/talos#9086

Signed-off-by: Noel Georgi <[email protected]>
  • Loading branch information
frezbo committed Aug 6, 2024
1 parent 6e6f029 commit 37f2297
Show file tree
Hide file tree
Showing 48 changed files with 625 additions and 71 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-08-01T13:26:11Z by kres faf91e3.
# Generated on 2024-08-06T13:24:11Z by kres 133368e.

name: default
concurrency:
Expand Down Expand Up @@ -33,7 +33,7 @@ jobs:
labels: ${{ steps.retrieve-pr-labels.outputs.result }}
services:
buildkitd:
image: moby/buildkit:v0.15.0
image: moby/buildkit:v0.15.1
options: --privileged
ports:
- 1234:1234
Expand Down Expand Up @@ -143,7 +143,7 @@ jobs:
- default
services:
buildkitd:
image: moby/buildkit:v0.15.0
image: moby/buildkit:v0.15.1
options: --privileged
ports:
- 1234:1234
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/weekly.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-08-01T13:26:11Z by kres faf91e3.
# Generated on 2024-08-06T13:24:11Z by kres 133368e.

name: weekly
concurrency:
Expand All @@ -16,7 +16,7 @@ jobs:
- pkgs
services:
buildkitd:
image: moby/buildkit:v0.15.0
image: moby/buildkit:v0.15.1
options: --privileged
ports:
- 1234:1234
Expand Down
14 changes: 9 additions & 5 deletions .kres.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,12 @@ spec:
- mdadm
- mei
- nut-client
- nvidia-container-toolkit
- nvidia-fabricmanager
- nvidia-open-gpu-kernel-modules
- nvidia-container-toolkit-lts
- nvidia-container-toolkit-production
- nvidia-fabricmanager-lts
- nvidia-fabricmanager-production
- nvidia-open-gpu-kernel-modules-lts
- nvidia-open-gpu-kernel-modules-production
- qemu-guest-agent
- qlogic-firmware
- realtek-firmware
Expand All @@ -43,7 +46,8 @@ spec:
- zfs
additionalTargets:
nonfree:
- nonfree-kmod-nvidia
- nonfree-kmod-nvidia-lts
- nonfree-kmod-nvidia-production
reproducibleTargetName: reproducibility
extraBuildArgs:
- TAG
Expand All @@ -54,7 +58,7 @@ spec:
- name: EXTENSIONS_IMAGE_REF
defaultValue: $(REGISTRY_AND_USERNAME)/extensions:$(TAG)
- name: PKGS
defaultValue: v1.8.0-alpha.0-41-ga97d58f
defaultValue: v1.8.0-alpha.0-45-gaf6b4e6
- name: PKGS_PREFIX
defaultValue: ghcr.io/siderolabs
useBldrPkgTagResolver: true
Expand Down
19 changes: 11 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-08-01T17:25:51Z by kres faf91e3.
# Generated on 2024-08-06T11:36:28Z by kres 2fded2b.

# common variables

Expand All @@ -25,7 +25,7 @@ SOURCE_DATE_EPOCH := $(shell git log $(INITIAL_COMMIT_SHA) --pretty=%ct)

# sync bldr image with pkgfile

BLDR_RELEASE := v0.3.1
BLDR_RELEASE := v0.3.2
BLDR_IMAGE := ghcr.io/siderolabs/bldr:$(BLDR_RELEASE)
BLDR := docker run --rm --user $(shell id -u):$(shell id -g) --volume $(PWD):/src --entrypoint=/bldr $(BLDR_IMAGE) --root=/src

Expand All @@ -48,7 +48,7 @@ COMMON_ARGS += --build-arg=PKGS_PREFIX="$(PKGS_PREFIX)"
# extra variables

EXTENSIONS_IMAGE_REF ?= $(REGISTRY_AND_USERNAME)/extensions:$(TAG)
PKGS ?= v1.8.0-alpha.0-41-ga97d58f
PKGS ?= v1.8.0-alpha.0-45-gaf6b4e6
PKGS_PREFIX ?= ghcr.io/siderolabs

# targets defines all the available targets
Expand All @@ -64,7 +64,6 @@ TARGETS += drbd
TARGETS += ecr-credential-provider
TARGETS += fuse3
TARGETS += gasket-driver
TARGETS += crun
TARGETS += gvisor
TARGETS += gvisor-debug
TARGETS += hello-world-service
Expand All @@ -76,9 +75,12 @@ TARGETS += kata-containers
TARGETS += mdadm
TARGETS += mei
TARGETS += nut-client
TARGETS += nvidia-container-toolkit
TARGETS += nvidia-fabricmanager
TARGETS += nvidia-open-gpu-kernel-modules
TARGETS += nvidia-container-toolkit-lts
TARGETS += nvidia-container-toolkit-production
TARGETS += nvidia-fabricmanager-lts
TARGETS += nvidia-fabricmanager-production
TARGETS += nvidia-open-gpu-kernel-modules-lts
TARGETS += nvidia-open-gpu-kernel-modules-production
TARGETS += qemu-guest-agent
TARGETS += qlogic-firmware
TARGETS += realtek-firmware
Expand All @@ -93,7 +95,8 @@ TARGETS += vmtoolsd-guest-agent
TARGETS += wasmedge
TARGETS += xen-guest-agent
TARGETS += zfs
NONFREE_TARGETS = nonfree-kmod-nvidia
NONFREE_TARGETS = nonfree-kmod-nvidia-lts
NONFREE_TARGETS += nonfree-kmod-nvidia-production

# help menu

Expand Down
21 changes: 21 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,27 @@ Gvisor now ships an additional runtime using `kvm` as the sandboxing mechanism.
title = "Intel Management Engine"
description = """
Intel Management Engine (IME) modules are now shipped as a Talos System Extension.
"""

[notes.nvidia]
title = "NVIDIA Driver and Container Toolkit"
description = """
The NVIDIA drivers and the container toolkit now ship in both LTS and Production versions, as per https://docs.nvidia.com/datacenter/tesla/drivers/index.html#lifecycle.
The new extensions are named below:
* nvidia-container-toolkit-production
* nvidia-container-toolkit-lts
* nvidia-open-gpu-kernel-modules-production
* nvidia-open-gpu-kernel-modules-lts
* nonfree-kmod-nvidia-lts
* nonfree-kmod-nvidia-production
The extensions ship the latest LTS/Production driver versions available at the time of the Talos release.
When Image Factory is used with an existing schematic ID, the NVIDIA driver and container toolkit are upgraded to the LTS version.
If the production version is required, the schematic ID should be updated to the production version.
"""

[notes.updates]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
version: v1alpha1
metadata:
name: nonfree-kmod-nvidia
name: nonfree-kmod-nvidia-lts
version: "$VERSION"
author: Sidero Labs
description: |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
name: nonfree-kmod-nvidia
name: nonfree-kmod-nvidia-lts
variant: scratch
shell: /toolchain/bin/bash
dependencies:
- stage: base
# The pkgs version for a particular release of Talos as defined in
# https://github.com/siderolabs/talos/blob/<talos version>/pkg/machinery/gendata/data/pkgs
- image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-pkg:{{ .BUILD_ARG_PKGS }}"
- image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-lts-pkg:{{ .BUILD_ARG_PKGS }}"
steps:
- prepare:
- |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# the first part is the driver version and the second is the Talos version the module is built against
VERSION: "{{ .NVIDIA_DRIVER_VERSION }}-{{ .BUILD_ARG_TAG }}"
VERSION: "{{ .NVIDIA_DRIVER_LTS_VERSION }}-{{ .BUILD_ARG_TAG }}"
File renamed without changes.
10 changes: 10 additions & 0 deletions nvidia-gpu/nonfree/kmod-nvidia/production/manifest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Talos system extension manifest for the production variant of the
# proprietary (non-free) NVIDIA kernel modules.
version: v1alpha1
metadata:
  name: nonfree-kmod-nvidia-production
  # "$VERSION" is a literal placeholder: the prepare step of the accompanying
  # pkg.yaml rewrites it in /pkg/manifest.yaml with the rendered VERSION value.
  version: "$VERSION"
  author: Sidero Labs
  description: |
    This system extension provides nvidia proprietary kernel modules built against a specific Talos version.
  compatibility:
    talos:
      version: ">= v1.5.0"
31 changes: 31 additions & 0 deletions nvidia-gpu/nonfree/kmod-nvidia/production/pkg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# bldr package definition for the production variant of the proprietary
# NVIDIA kernel module extension: it copies the prebuilt modules out of the
# pkgs image into the extension rootfs and ships them with the manifest.
name: nonfree-kmod-nvidia-production
variant: scratch
shell: /toolchain/bin/bash
dependencies:
  - stage: base
  # The pkgs version for a particular release of Talos as defined in
  # https://github.com/siderolabs/talos/blob/<talos version>/pkg/machinery/gendata/data/pkgs
  - image: "{{ .BUILD_ARG_PKGS_PREFIX }}/nonfree-kmod-nvidia-production-pkg:{{ .BUILD_ARG_PKGS }}"
steps:
  # Substitute the "$VERSION" placeholder in the manifest with the rendered
  # VERSION variable.
  - prepare:
      - |
        sed -i 's#$VERSION#{{ .VERSION }}#' /pkg/manifest.yaml
  - install:
      # Stage the modprobe configuration and every kernel module tree found
      # under /lib/modules into the extension rootfs.
      - |
        mkdir -p /rootfs/lib/modules \
          /rootfs/usr/local/lib/modprobe.d
        cp /pkg/files/nvidia.conf /rootfs/usr/local/lib/modprobe.d/nvidia.conf
        cp -R /lib/modules/* /rootfs/lib/modules
    test:
      # Run the extensions validator against a scratch copy of the rootfs
      # together with the manifest.
      - |
        mkdir -p /extensions-validator-rootfs
        cp -r /rootfs/ /extensions-validator-rootfs/rootfs
        cp /pkg/manifest.yaml /extensions-validator-rootfs/manifest.yaml
        /extensions-validator validate --rootfs=/extensions-validator-rootfs --pkg-name="${PKG_NAME}"
finalize:
  - from: /rootfs
    to: /rootfs
  - from: /pkg/manifest.yaml
    to: /
2 changes: 2 additions & 0 deletions nvidia-gpu/nonfree/kmod-nvidia/production/vars.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# VERSION has the form "<driver version>-<Talos version>": the first part is the
# NVIDIA production driver version, the second is the Talos version the module is built against.
VERSION: "{{ .NVIDIA_DRIVER_PRODUCTION_VERSION }}-{{ .BUILD_ARG_TAG }}"
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
version: v1alpha1
metadata:
name: nvidia-container-toolkit
name: nvidia-container-toolkit-lts
version: "$VERSION"
author: Sidero Labs
description: |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: nvidia-container-toolkit
name: nvidia-container-toolkit-lts
variant: scratch
shell: /toolchain/bin/bash
dependencies:
- stage: base
- stage: nvidia-container-cli
- stage: nvidia-container-cli-lts
- stage: elfutils
- stage: zlib
- stage: libcap
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# the first part is the driver version and the second the container-toolkit version
VERSION: "{{ .NVIDIA_DRIVER_VERSION }}-{{ .CONTAINER_TOOLKIT_VERSION }}"
VERSION: "{{ .NVIDIA_DRIVER_LTS_VERSION }}-{{ .CONTAINER_TOOLKIT_VERSION }}"
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: nvidia-container-cli
name: nvidia-container-cli-lts
variant: scratch
shell: /bin/bash
install:
Expand All @@ -15,7 +15,7 @@ dependencies:
# nvidia-pkgs depends on glibc,
# so any stage depending on nvidia-container-cli will have the updated ld.so.cache,
# from both nvidia-pkgs and nvidia-container-cli
- stage: nvidia-pkgs
- stage: nvidia-pkgs-lts
- stage: libseccomp
from: /rootfs
- stage: libcap
Expand All @@ -28,10 +28,10 @@ dependencies:
from: /rootfs
steps:
- sources:
- url: https://gitlab.com/nvidia/container-toolkit/libnvidia-container/-/archive/{{ .LIBNVIDIA_CONTAINER_VERSION }}/libnvidia-container-{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz
- url: https://github.com/NVIDIA/libnvidia-container/archive/refs/tags/{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz
destination: libnvidia-container.tar.gz
sha256: d23984591004c59c33f6f13c8237f1fb84113b8eddb0f9943302df4c3b0cc549
sha512: a5a75b0cd29cf7c0484dbd650456c93bb495a0fe5449d6b8c7680af7509be3b9e1f12ab437b56309bfb4b66cfe2868b4adbe882e29b169c7733c0247ecf2489b
sha256: cbc1dda7ee90b8b729c5f178292cd07b421863015d84b84c37e69c8d580ab3ff
sha512: b304c284c5ab0c3544362307dc16ffcca8d34497e4356a520dc6da81a86a62b2a262b528cba559bb0d7a3addf018c3b50b6cb78669c82c1b4acae159e5922548
env:
SOURCE_DATE_EPOCH: {{ .BUILD_ARG_SOURCE_DATE_EPOCH }}
REVISION: {{ .LIBNVIDIA_CONTAINER_REF }}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# bldr package definition that builds libnvidia-container (the
# nvidia-container-cli library and tool) on top of the production NVIDIA
# packages stage and installs the result into /rootfs.
name: nvidia-container-cli-production
variant: scratch
shell: /bin/bash
install:
  - build-base
  - bash
  - go
  - coreutils
  - sed
  - curl
  - rpcsvc-proto
  - patch
dependencies:
  - image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }}
  # nvidia-pkgs depends on glibc,
  # so any stage depending on nvidia-container-cli will have the updated ld.so.cache,
  # from both nvidia-pkgs and nvidia-container-cli
  - stage: nvidia-pkgs-production
  - stage: libseccomp
    from: /rootfs
  - stage: libcap
    from: /rootfs
  - stage: elfutils
    from: /rootfs
  - stage: zlib
    from: /rootfs
  - stage: libtirpc
    from: /rootfs
steps:
  - sources:
      - url: https://github.com/NVIDIA/libnvidia-container/archive/refs/tags/{{ .LIBNVIDIA_CONTAINER_VERSION }}.tar.gz
        destination: libnvidia-container.tar.gz
        sha256: cbc1dda7ee90b8b729c5f178292cd07b421863015d84b84c37e69c8d580ab3ff
        sha512: b304c284c5ab0c3544362307dc16ffcca8d34497e4356a520dc6da81a86a62b2a262b528cba559bb0d7a3addf018c3b50b6cb78669c82c1b4acae159e5922548
    # Every env value is quoted so it always loads as a string: bare yes/no are
    # boolean spellings under YAML 1.1, and a templated value could otherwise
    # render to something that is re-typed (or to an empty value, i.e. null).
    env:
      SOURCE_DATE_EPOCH: "{{ .BUILD_ARG_SOURCE_DATE_EPOCH }}"
      REVISION: "{{ .LIBNVIDIA_CONTAINER_REF }}"
      # strip the "v" from the tag to get the library version
      LIB_VERSION: '{{ .LIBNVIDIA_CONTAINER_VERSION | replace "v" "" }}'
      WITH_NVCGO: "yes"
      WITH_LIBELF: "yes"
      WITH_TIRPC: "no"  # setting no means we'll use the system libtirpc
      WITH_SECCOMP: "yes"
      PKG_CONFIG_PATH: /usr/local/glibc/lib/pkgconfig  # to find runtime libraries compiled in extensions (libseccomp)
      PATH: "/usr/bin:{{ .PATH }}"  # bldr doesn't have /usr/bin in PATH
    prepare:
      # Unpack the source tarball into a fixed directory name, dropping the
      # archive's top-level directory.
      - |
        mkdir libnvidia-container
        tar -xzf libnvidia-container.tar.gz --strip-components=1 -C libnvidia-container
    build:
      - |
        cd libnvidia-container
        # LDLIBS=-L/usr/local/glibc/lib is set so that libnvidia-container-cli libs which are hardcoded as -llibname and not using pkg-config
        CPPFLAGS="-I/usr/local/glibc/include/tirpc" LDLIBS="-L/usr/local/glibc/lib -ltirpc -lelf -lseccomp" LDFLAGS='-Wl,--rpath=\$$ORIGIN/../glibc/\$$LIB' make
    install:
      - |
        mkdir -p /rootfs
        cd libnvidia-container
        make install DESTDIR=/rootfs
        # run ldconfig to update the cache
        /rootfs/usr/local/glibc/sbin/ldconfig -r /rootfs
finalize:
  - from: /rootfs
    to: /rootfs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ dependencies:
- image: cgr.dev/chainguard/wolfi-base@{{ .WOLFI_BASE_REF }}
steps:
- sources:
- url: https://gitlab.com/nvidia/container-toolkit/container-toolkit/-/archive/{{ .CONTAINER_TOOLKIT_VERSION }}/container-toolkit-{{ .CONTAINER_TOOLKIT_VERSION }}.tar.gz
- url: https://github.com/NVIDIA/nvidia-container-toolkit/archive/refs/tags/{{ .CONTAINER_TOOLKIT_VERSION }}.tar.gz
destination: container-toolkit.tar.gz
sha256: b006700e31ed1475ed25695770cab10d74fdac55cdb94e66d70468740482fb53
sha512: 11ceffddb164194d0f10c60aeec2c1e20c699a6f3cb1887bca8f49496c9fda869c6c65f1f5f8e816467abee43da002fe2922b8e68ba8f6e61d30f635509da5e0
sha256: 38a193444e0342c0a2c0d3664403e2c341eb77f1461b3f9172fd93c04de82165
sha512: 691d4fc47ea60b730ec491b333aa8118bcfd62cdab20a42b84155c6a13484d920e758435b5029bbae4fbefce82352aa5764f1554992682f689c95615809fb83c
env:
GIT_COMMIT: {{ substr 0 7 .CONTAINER_TOOLKIT_REF }} # build is using short sha
prepare:
Expand Down
Loading

0 comments on commit 37f2297

Please sign in to comment.