From f9ade1940c7fd32084d1b48b42db4d07efed5a9d Mon Sep 17 00:00:00 2001 From: tyeth Date: Tue, 17 Mar 2026 20:09:27 +0000 Subject: [PATCH 1/4] fix(gpu): add WSL2 GPU support via CDI mode and bundle device plugin chart WSL2 virtualises GPU access through /dev/dxg instead of native /dev/nvidia* device nodes, which breaks the entire NVIDIA k8s device plugin detection chain. Three changes fix this: 1. Detect WSL2 in cluster-entrypoint.sh and configure CDI mode: - Generate CDI spec with nvidia-ctk (auto-detects WSL mode) - Patch the spec to include libdxcore.so (nvidia-ctk bug omits it) - Switch nvidia-container-runtime from auto to cdi mode - Deploy a job to label the node with pci-10de.present=true (NFD can't see NVIDIA PCI on WSL2's virtualised bus) 2. Bundle the nvidia-device-plugin Helm chart in the cluster image instead of fetching from the upstream GitHub Pages repo at startup. The repo URL (nvidia.github.io/k8s-device-plugin/index.yaml) currently returns 404. 3. Update the HelmChart CR to reference the bundled local chart tarball via the k3s static charts API endpoint. Closes NVIDIA/OpenShell#404 --- deploy/docker/cluster-entrypoint.sh | 96 +++++++++++++++++++ .../nvidia-device-plugin-helmchart.yaml | 4 +- tasks/scripts/docker-build-cluster.sh | 12 +++ tasks/scripts/docker-publish-multiarch.sh | 12 +++ 4 files changed, 121 insertions(+), 3 deletions(-) diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 19fae35d..221eebb7 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -328,6 +328,102 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then cp "$manifest" "$K3S_MANIFESTS/" done fi + + # ------------------------------------------------------------------- + # WSL2 GPU support: CDI mode + libdxcore.so injection + node labeling + # ------------------------------------------------------------------- + # WSL2 virtualises GPU access through /dev/dxg instead of native + # /dev/nvidia* device nodes. The legacy nvidia-container-runtime + # injection path fails because: + # 1. NVML can't initialise without libdxcore.so (the bridge between + # Linux NVML and the Windows DirectX GPU Kernel via /dev/dxg) + # 2. NFD can't detect NVIDIA PCI vendor (WSL2 hides PCI topology) + # + # Fix: switch to CDI mode, patch the CDI spec with libdxcore.so, and + # add a k3s manifest that labels the node for the device plugin + # DaemonSet affinity. + if [ -c /dev/dxg ]; then + echo "WSL2 detected (/dev/dxg present) — configuring CDI mode for GPU" + + # 1. Generate CDI spec (nvidia-ctk auto-detects WSL mode) + if command -v nvidia-ctk >/dev/null 2>&1; then + mkdir -p /var/run/cdi + nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml 2>&1 || true + + # 2. Patch CDI spec: add libdxcore.so mount (nvidia-ctk misses it) + DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1) + if [ -n "$DXCORE_PATH" ] && [ -f /var/run/cdi/nvidia.yaml ]; then + DXCORE_DIR=$(dirname "$DXCORE_PATH") + # Insert libdxcore mount after the mounts: key + sed -i "/^ mounts:/a\\ + - hostPath: $DXCORE_PATH\\ + containerPath: $DXCORE_PATH\\ + options:\\ + - ro\\ + - nosuid\\ + - nodev\\ + - rbind\\ + - rprivate" /var/run/cdi/nvidia.yaml + # Add ldcache folder for libdxcore directory + sed -i "s|update-ldcache|update-ldcache\n - --folder\n - $DXCORE_DIR|" /var/run/cdi/nvidia.yaml + echo "CDI spec patched with libdxcore.so from $DXCORE_PATH" + else + echo "Warning: libdxcore.so not found — NVML may fail inside pods" + fi + fi + + # 3. Switch nvidia container runtime to CDI mode + NVIDIA_RUNTIME_CONFIG="/etc/nvidia-container-runtime/config.toml" + if [ -f "$NVIDIA_RUNTIME_CONFIG" ]; then + sed -i 's/mode = "auto"/mode = "cdi"/' "$NVIDIA_RUNTIME_CONFIG" + echo "nvidia-container-runtime switched to CDI mode" + fi + + # 4. Create a k3s manifest to label the node with NVIDIA PCI vendor + # (NFD can't detect it on WSL2 since PCI topology is virtualised) + cat > "$K3S_MANIFESTS/wsl2-gpu-node-label.yaml" <<'WSLEOF' +apiVersion: batch/v1 +kind: Job +metadata: + name: wsl2-gpu-node-label + namespace: kube-system +spec: + template: + spec: + serviceAccountName: default + hostNetwork: true + tolerations: + - operator: Exists + containers: + - name: label + image: rancher/mirrored-library-busybox:1.37.0 + command: + - /bin/sh + - -c + - | + # Wait for the API server, then label the node + until wget -qO- --no-check-certificate https://kubernetes.default.svc/api/v1/nodes 2>/dev/null | grep -q '"items"'; do + sleep 2 + done + NODE=$(wget -qO- --no-check-certificate \ + -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \ + https://kubernetes.default.svc/api/v1/nodes 2>/dev/null \ + | sed -n 's/.*"name":"\([^"]*\)".*/\1/p' | head -1) + if [ -n "$NODE" ]; then + wget -qO- --no-check-certificate \ + -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \ + -H "Content-Type: application/strategic-merge-patch+json" \ + --method=PATCH \ + --body-data='{"metadata":{"labels":{"feature.node.kubernetes.io/pci-10de.present":"true"}}}' \ + "https://kubernetes.default.svc/api/v1/nodes/$NODE" >/dev/null 2>&1 \ + && echo "Labeled node $NODE with pci-10de.present=true" \ + || echo "Warning: failed to label node $NODE" + fi + restartPolicy: OnFailure + backoffLimit: 10 +WSLEOF + echo "WSL2 GPU node-label job manifest installed" + fi fi # --------------------------------------------------------------------------- diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml index 57503d31..4aa6743d 100644 --- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml +++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml @@ -19,9 +19,7 @@ metadata: name: nvidia-device-plugin namespace: kube-system spec: - repo: https://nvidia.github.io/k8s-device-plugin - chart: nvidia-device-plugin - version: "0.18.2" + chart: https://%{KUBERNETES_API}%/static/charts/nvidia-device-plugin-0.18.2.tgz targetNamespace: nvidia-device-plugin createNamespace: true valuesContent: |- diff --git a/tasks/scripts/docker-build-cluster.sh b/tasks/scripts/docker-build-cluster.sh index 80dc2a48..ef9f9393 100755 --- a/tasks/scripts/docker-build-cluster.sh +++ b/tasks/scripts/docker-build-cluster.sh @@ -53,6 +53,18 @@ mkdir -p deploy/docker/.build/charts echo "Packaging helm chart..." helm package deploy/helm/openshell -d deploy/docker/.build/charts/ +# Download nvidia-device-plugin chart for GPU support (bundled to avoid +# dependency on the upstream GitHub Pages Helm repo at cluster start time) +NVIDIA_DP_VERSION="0.18.2" +NVIDIA_DP_CHART="deploy/docker/.build/charts/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" +if [ ! -f "$NVIDIA_DP_CHART" ]; then + echo "Downloading nvidia-device-plugin chart v${NVIDIA_DP_VERSION}..." + curl -fsSL "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/gh-pages/stable/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" \ + -o "$NVIDIA_DP_CHART" || { + echo "Warning: failed to download nvidia-device-plugin chart; GPU support may not work" + } +fi + # Build cluster image (no bundled component images — they are pulled at runtime # from the distribution registry; credentials are injected at deploy time) echo "Building cluster image..." diff --git a/tasks/scripts/docker-publish-multiarch.sh b/tasks/scripts/docker-publish-multiarch.sh index 7bb6dc84..7395ed0c 100755 --- a/tasks/scripts/docker-publish-multiarch.sh +++ b/tasks/scripts/docker-publish-multiarch.sh @@ -176,6 +176,18 @@ mkdir -p deploy/docker/.build/charts echo "Packaging helm chart..." helm package deploy/helm/openshell -d deploy/docker/.build/charts/ +# Download nvidia-device-plugin chart for GPU support (bundled to avoid +# dependency on the upstream GitHub Pages Helm repo at cluster start time) +NVIDIA_DP_VERSION="0.18.2" +NVIDIA_DP_CHART="deploy/docker/.build/charts/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" +if [ ! -f "$NVIDIA_DP_CHART" ]; then + echo "Downloading nvidia-device-plugin chart v${NVIDIA_DP_VERSION}..." + curl -fsSL "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/gh-pages/stable/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" \ + -o "$NVIDIA_DP_CHART" || { + echo "Warning: failed to download nvidia-device-plugin chart; GPU support may not work" + } +fi + # --------------------------------------------------------------------------- # Step 3: Build and push multi-arch cluster image. # The cluster image includes the supervisor binary (built from Rust source) From af1ae24e82c1707a5ce42deaa2babad1fc1ace94 Mon Sep 17 00:00:00 2001 From: tyeth Date: Tue, 17 Mar 2026 20:20:45 +0000 Subject: [PATCH 2/4] fix(gpu): revert helm chart bundling, keep only WSL2 CDI fix The upstream Helm repo URL works fine; remove the unnecessary chart bundling and local reference changes. --- .../nvidia-device-plugin-helmchart.yaml | 4 +++- tasks/scripts/docker-build-cluster.sh | 12 ------------ tasks/scripts/docker-publish-multiarch.sh | 12 ------------ 3 files changed, 3 insertions(+), 25 deletions(-) diff --git a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml index 4aa6743d..57503d31 100644 --- a/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml +++ b/deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml @@ -19,7 +19,9 @@ metadata: name: nvidia-device-plugin namespace: kube-system spec: - chart: https://%{KUBERNETES_API}%/static/charts/nvidia-device-plugin-0.18.2.tgz + repo: https://nvidia.github.io/k8s-device-plugin + chart: nvidia-device-plugin + version: "0.18.2" targetNamespace: nvidia-device-plugin createNamespace: true valuesContent: |- diff --git a/tasks/scripts/docker-build-cluster.sh b/tasks/scripts/docker-build-cluster.sh index ef9f9393..80dc2a48 100755 --- a/tasks/scripts/docker-build-cluster.sh +++ b/tasks/scripts/docker-build-cluster.sh @@ -53,18 +53,6 @@ mkdir -p deploy/docker/.build/charts echo "Packaging helm chart..." helm package deploy/helm/openshell -d deploy/docker/.build/charts/ -# Download nvidia-device-plugin chart for GPU support (bundled to avoid -# dependency on the upstream GitHub Pages Helm repo at cluster start time) -NVIDIA_DP_VERSION="0.18.2" -NVIDIA_DP_CHART="deploy/docker/.build/charts/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" -if [ ! -f "$NVIDIA_DP_CHART" ]; then - echo "Downloading nvidia-device-plugin chart v${NVIDIA_DP_VERSION}..." - curl -fsSL "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/gh-pages/stable/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" \ - -o "$NVIDIA_DP_CHART" || { - echo "Warning: failed to download nvidia-device-plugin chart; GPU support may not work" - } -fi - # Build cluster image (no bundled component images — they are pulled at runtime # from the distribution registry; credentials are injected at deploy time) echo "Building cluster image..." diff --git a/tasks/scripts/docker-publish-multiarch.sh b/tasks/scripts/docker-publish-multiarch.sh index 7395ed0c..7bb6dc84 100755 --- a/tasks/scripts/docker-publish-multiarch.sh +++ b/tasks/scripts/docker-publish-multiarch.sh @@ -176,18 +176,6 @@ mkdir -p deploy/docker/.build/charts echo "Packaging helm chart..." helm package deploy/helm/openshell -d deploy/docker/.build/charts/ -# Download nvidia-device-plugin chart for GPU support (bundled to avoid -# dependency on the upstream GitHub Pages Helm repo at cluster start time) -NVIDIA_DP_VERSION="0.18.2" -NVIDIA_DP_CHART="deploy/docker/.build/charts/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" -if [ ! -f "$NVIDIA_DP_CHART" ]; then - echo "Downloading nvidia-device-plugin chart v${NVIDIA_DP_VERSION}..." - curl -fsSL "https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/gh-pages/stable/nvidia-device-plugin-${NVIDIA_DP_VERSION}.tgz" \ - -o "$NVIDIA_DP_CHART" || { - echo "Warning: failed to download nvidia-device-plugin chart; GPU support may not work" - } -fi - # --------------------------------------------------------------------------- # Step 3: Build and push multi-arch cluster image. # The cluster image includes the supervisor binary (built from Rust source) From dae043b80371d9a395ab4ef927565fb9c7ccf481 Mon Sep 17 00:00:00 2001 From: tyeth Date: Tue, 17 Mar 2026 20:51:42 +0000 Subject: [PATCH 3/4] fix(gpu): add WSL2 GPU support via CDI mode WSL2 virtualises GPU access through /dev/dxg instead of native /dev/nvidia* device nodes, which breaks the entire NVIDIA k8s device plugin detection chain. This patch detects WSL2 at container startup and applies fixes: 1. Generate CDI spec with nvidia-ctk (auto-detects WSL mode) 2. Add per-GPU UUID and index device entries to CDI spec (nvidia-ctk only generates name=all but the device plugin assigns GPUs by UUID) 3. Bump CDI spec version from 0.3.0 to 0.5.0 (library minimum) 4. Patch the spec to include libdxcore.so (nvidia-ctk bug omits it; this library bridges Linux NVML to the Windows DirectX GPU Kernel) 5. Switch nvidia-container-runtime from auto to cdi mode 6. Deploy a job to label the node with pci-10de.present=true (NFD can't see NVIDIA PCI on WSL2's virtualised bus) Closes NVIDIA/OpenShell#404 --- deploy/docker/cluster-entrypoint.sh | 31 ++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 221eebb7..acf58aab 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -350,7 +350,32 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then mkdir -p /var/run/cdi nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml 2>&1 || true - # 2. Patch CDI spec: add libdxcore.so mount (nvidia-ctk misses it) + # 2. Add per-GPU device entries (UUID and index) to CDI spec. + # nvidia-ctk only generates name=all, but the device plugin + # assigns GPUs by UUID which must resolve as a CDI device. + if [ -f /var/run/cdi/nvidia.yaml ] && command -v nvidia-smi >/dev/null 2>&1; then + idx=0 + nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | while read -r uuid; do + uuid=$(echo "$uuid" | tr -d ' ') + [ -z "$uuid" ] && continue + sed -i "/- name: all/a\\ + - name: $uuid\\ + containerEdits:\\ + deviceNodes:\\ + - path: /dev/dxg\\ + - name: \"$idx\"\\ + containerEdits:\\ + deviceNodes:\\ + - path: /dev/dxg" /var/run/cdi/nvidia.yaml + idx=$((idx + 1)) + done + # nvidia-ctk cdi generate uses cdiVersion 0.3.0 but the + # installed CDI library requires >= 0.5.0 + sed -i 's/cdiVersion: 0\.3\.0/cdiVersion: 0.5.0/' /var/run/cdi/nvidia.yaml + echo "CDI spec: added per-GPU UUID and index device entries" + fi + + # 4. Patch CDI spec: add libdxcore.so mount (nvidia-ctk misses it) DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1) if [ -n "$DXCORE_PATH" ] && [ -f /var/run/cdi/nvidia.yaml ]; then DXCORE_DIR=$(dirname "$DXCORE_PATH") @@ -372,14 +397,14 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then fi fi - # 3. Switch nvidia container runtime to CDI mode + # 5. Switch nvidia container runtime to CDI mode NVIDIA_RUNTIME_CONFIG="/etc/nvidia-container-runtime/config.toml" if [ -f "$NVIDIA_RUNTIME_CONFIG" ]; then sed -i 's/mode = "auto"/mode = "cdi"/' "$NVIDIA_RUNTIME_CONFIG" echo "nvidia-container-runtime switched to CDI mode" fi - # 4. Create a k3s manifest to label the node with NVIDIA PCI vendor + # 6. Create a k3s manifest to label the node with NVIDIA PCI vendor # (NFD can't detect it on WSL2 since PCI topology is virtualised) cat > "$K3S_MANIFESTS/wsl2-gpu-node-label.yaml" <<'WSLEOF' apiVersion: batch/v1 From e2b9f10e021ce09011b8ceaaf801c6ed6ef8af08 Mon Sep 17 00:00:00 2001 From: tyeth Date: Tue, 17 Mar 2026 23:47:14 +0000 Subject: [PATCH 4/4] fix(gpu): write complete CDI spec instead of fragile sed patching The previous approach used sed to inject GPU UUID entries and libdxcore.so mounts into the nvidia-ctk-generated CDI spec. This corrupted the YAML structure (duplicate containerEdits keys) causing CDI device resolution to fail with "failed to unmarshal CDI Spec". Replace with writing the complete CDI spec from scratch using a heredoc. This is more robust and easier to understand. The spec includes: - /dev/dxg device node - Per-GPU entries by UUID and index (for device plugin allocation) - libdxcore.so mount (missing from nvidia-ctk on WSL2) - All WSL driver store library mounts - ldcache update hooks for both driver store and libdxcore directories Tested end-to-end: nemoclaw onboard -> gateway start -> WSL2 fix -> sandbox create with GPU -> nvidia-smi working inside sandbox pod. --- deploy/docker/cluster-entrypoint.sh | 128 ++++++++++++++++++---------- 1 file changed, 83 insertions(+), 45 deletions(-) diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index acf58aab..1e0f8ca8 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -345,55 +345,93 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then if [ -c /dev/dxg ]; then echo "WSL2 detected (/dev/dxg present) — configuring CDI mode for GPU" - # 1. Generate CDI spec (nvidia-ctk auto-detects WSL mode) - if command -v nvidia-ctk >/dev/null 2>&1; then + # 1. Build a complete CDI spec from scratch. + # nvidia-ctk cdi generate has two WSL2 bugs: + # a) only creates name=all but the device plugin assigns by UUID + # b) misses libdxcore.so (the NVML-to-DXG bridge library) + # Writing the spec directly avoids fragile sed patching of YAML. + if command -v nvidia-ctk >/dev/null 2>&1 && command -v nvidia-smi >/dev/null 2>&1; then mkdir -p /var/run/cdi - nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml 2>&1 || true - - # 2. Add per-GPU device entries (UUID and index) to CDI spec. - # nvidia-ctk only generates name=all, but the device plugin - # assigns GPUs by UUID which must resolve as a CDI device. - if [ -f /var/run/cdi/nvidia.yaml ] && command -v nvidia-smi >/dev/null 2>&1; then - idx=0 - nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | while read -r uuid; do - uuid=$(echo "$uuid" | tr -d ' ') - [ -z "$uuid" ] && continue - sed -i "/- name: all/a\\ - - name: $uuid\\ - containerEdits:\\ - deviceNodes:\\ - - path: /dev/dxg\\ - - name: \"$idx\"\\ - containerEdits:\\ - deviceNodes:\\ - - path: /dev/dxg" /var/run/cdi/nvidia.yaml - idx=$((idx + 1)) - done - # nvidia-ctk cdi generate uses cdiVersion 0.3.0 but the - # installed CDI library requires >= 0.5.0 - sed -i 's/cdiVersion: 0\.3\.0/cdiVersion: 0.5.0/' /var/run/cdi/nvidia.yaml - echo "CDI spec: added per-GPU UUID and index device entries" - fi - # 4. Patch CDI spec: add libdxcore.so mount (nvidia-ctk misses it) + GPU_UUID=$(nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | tr -d ' ' | head -1) DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1) - if [ -n "$DXCORE_PATH" ] && [ -f /var/run/cdi/nvidia.yaml ]; then - DXCORE_DIR=$(dirname "$DXCORE_PATH") - # Insert libdxcore mount after the mounts: key - sed -i "/^ mounts:/a\\ - - hostPath: $DXCORE_PATH\\ - containerPath: $DXCORE_PATH\\ - options:\\ - - ro\\ - - nosuid\\ - - nodev\\ - - rbind\\ - - rprivate" /var/run/cdi/nvidia.yaml - # Add ldcache folder for libdxcore directory - sed -i "s|update-ldcache|update-ldcache\n - --folder\n - $DXCORE_DIR|" /var/run/cdi/nvidia.yaml - echo "CDI spec patched with libdxcore.so from $DXCORE_PATH" + DXCORE_DIR=$(dirname "$DXCORE_PATH" 2>/dev/null || echo "/usr/lib/x86_64-linux-gnu") + DRIVER_DIR=$(ls -d /usr/lib/wsl/drivers/nv*.inf_amd64_* 2>/dev/null | head -1) + + if [ -n "$DRIVER_DIR" ] && [ -n "$GPU_UUID" ]; then + cat > /var/run/cdi/nvidia.yaml <