20 changes: 17 additions & 3 deletions architecture/gateway-single-node.md
@@ -296,8 +296,10 @@ When environment variables are set, the entrypoint modifies the HelmChart manife

GPU support is part of the single-node gateway bootstrap path rather than a separate architecture.

- - `openshell gateway start --gpu` threads a boolean deploy option through `crates/openshell-cli`, `crates/openshell-bootstrap`, and `crates/openshell-bootstrap/src/docker.rs`.
- - When enabled, the cluster container is created with Docker `DeviceRequests`, which is the API equivalent of `docker run --gpus all`.
+ - `openshell gateway start --gpu` threads GPU device options through `crates/openshell-cli`, `crates/openshell-bootstrap`, and `crates/openshell-bootstrap/src/docker.rs`.
+ - When enabled, the cluster container is created with Docker `DeviceRequests`. The injection mechanism is selected based on whether CDI is enabled on the daemon (`SystemInfo.CDISpecDirs` via `GET /info`):
+   - **CDI enabled** (daemon reports non-empty `CDISpecDirs`): CDI device injection — `driver="cdi"` with `nvidia.com/gpu=all`. Specs are expected to be pre-generated on the host (e.g. automatically by `nvidia-cdi-refresh.service` or manually via `nvidia-ctk cdi generate`).
+   - **CDI not enabled**: `--gpus all` device request — `driver="nvidia"`, `count=-1`, which relies on the NVIDIA Container Runtime hook.
- `deploy/docker/Dockerfile.images` installs NVIDIA Container Toolkit packages in a dedicated Ubuntu stage and copies the runtime binaries, config, and `libnvidia-container` shared libraries into the final Ubuntu-based cluster image.
- `deploy/docker/cluster-entrypoint.sh` checks `GPU_ENABLED=true` and copies GPU-only manifests from `/opt/openshell/gpu-manifests/` into k3s's manifests directory.
- `deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml` installs the NVIDIA device plugin chart, currently pinned to `0.18.2`, along with GPU Feature Discovery and Node Feature Discovery.
@@ -308,12 +310,24 @@ The runtime chain is:

```text
Host GPU drivers & NVIDIA Container Toolkit
-└─ Docker: --gpus all (DeviceRequests in bollard API)
+└─ Docker: DeviceRequests (CDI when enabled, --gpus all otherwise)
└─ k3s/containerd: nvidia-container-runtime on PATH -> auto-detected
└─ k8s: nvidia-device-plugin DaemonSet advertises nvidia.com/gpu
└─ Pods: request nvidia.com/gpu in resource limits
```
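Each link in this chain can be spot-checked from the host. A sketch, assuming Docker 25+ and the NVIDIA Container Toolkit are installed (the `nvidia-ctk` flags shown reflect current toolkit versions and may differ on older ones):

```shell
# Daemon side: non-empty output means CDI is enabled (SystemInfo.CDISpecDirs).
docker info --format '{{ .CDISpecDirs }}'

# Host side: regenerate CDI specs manually (normally handled by
# nvidia-cdi-refresh.service) and list the injectable device names.
sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
nvidia-ctk cdi list

# Cluster side: confirm the device plugin advertises the resource.
kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'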

### `--gpu` / `--device` flag

The `--gpu` flag (aliased as `--device`) on `gateway start` accepts an optional value that overrides the automatic injection mode:

| Invocation | Behaviour |
|---|---|
| `--gpu` | Auto-select: CDI when enabled on the daemon, `--gpus all` otherwise |
| `--gpu=legacy` | Force `--gpus all` |
| `--gpu=<cdi-device>` | Inject a specific CDI device (e.g. `nvidia.com/gpu=all`). May be repeated for multiple devices. Note: because the cluster container runs privileged, device-level isolation may not work as expected. |

Mixing `legacy` or auto-select with explicit CDI device names in the same invocation is an error.
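The mixing rule above amounts to: a sentinel value (`legacy`, or the bare-flag auto-select) must be the only entry in the device list. A hypothetical sketch — the real validation lives in `crates/openshell-cli` and its exact error wording may differ:

```rust
/// Hypothetical sketch of the `--gpu` mixing rule; not the actual CLI code.
fn validate_gpu_flags(values: &[String]) -> Result<(), String> {
    let is_sentinel = |v: &String| v == "auto" || v == "legacy";
    // Sentinels select a whole injection mode, so they must appear alone:
    // combining them with explicit CDI device names (or repeating them)
    // is rejected.
    if values.iter().any(is_sentinel) && values.len() > 1 {
        return Err(
            "--gpu: cannot mix 'legacy'/auto-select with explicit CDI device names".into(),
        );
    }
    Ok(())
}

fn main() {
    assert!(validate_gpu_flags(&["auto".to_string()]).is_ok());
    assert!(validate_gpu_flags(&[
        "nvidia.com/gpu=0".to_string(),
        "nvidia.com/gpu=1".to_string(),
    ])
    .is_ok());
    assert!(validate_gpu_flags(&["legacy".to_string(), "nvidia.com/gpu=0".to_string()]).is_err());
    println!("ok");
}
```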

The expected smoke test is a plain pod requesting `nvidia.com/gpu: 1` with `runtimeClassName: nvidia` and running `nvidia-smi`.
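A minimal sketch of such a smoke-test pod, assuming the device plugin is running; the image tag is illustrative, not prescribed by this repo:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  containers:
    - name: nvidia-smi
      image: nvidia/cuda:12.4.1-base-ubuntu22.04   # illustrative tag
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
```

The pod should reach `Completed` with `nvidia-smi` output in its logs; a `Pending` pod usually means the device plugin has not advertised `nvidia.com/gpu` on any node.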

## Remote Image Transfer
152 changes: 135 additions & 17 deletions crates/openshell-bootstrap/src/docker.rs
@@ -22,6 +22,39 @@ use std::collections::HashMap;

const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell";

/// Returns true if the Docker daemon has CDI enabled.
///
/// CDI is considered enabled when the daemon reports at least one CDI spec
/// directory via `GET /info` (`SystemInfo.CDISpecDirs`). An empty list or a
/// missing field means CDI is not configured, and we fall back to the legacy
/// NVIDIA `DeviceRequest` (driver="nvidia").
fn cdi_enabled(cdi_spec_dirs: Option<&[String]>) -> bool {
cdi_spec_dirs.is_some_and(|dirs| !dirs.is_empty())
}

/// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a
/// concrete device ID based on whether CDI is enabled on the daemon.
///
/// | Input | Output |
/// |--------------|--------------------------------------------------------------|
/// | `[]` | `[]` — no GPU |
/// | `["legacy"]` | `["legacy"]` — pass through |
/// | `["auto"]` | `["nvidia.com/gpu=all"]` if CDI enabled, else `["legacy"]` |
/// | `[cdi-ids…]` | unchanged |
pub(crate) fn resolve_gpu_device_ids(gpu: &[String], cdi_enabled: bool) -> Vec<String> {
match gpu {
[] => vec![],
[v] if v == "auto" => {
if cdi_enabled {
vec!["nvidia.com/gpu=all".to_string()]
} else {
vec!["legacy".to_string()]
}
}
other => other.to_vec(),
}
}

const REGISTRY_MODE_EXTERNAL: &str = "external";

fn env_non_empty(key: &str) -> Option<String> {
@@ -454,7 +487,7 @@ pub async fn ensure_container(
disable_gateway_auth: bool,
registry_username: Option<&str>,
registry_token: Option<&str>,
-    gpu: bool,
+    device_ids: &[String],
) -> Result<()> {
let container_name = container_name(name);

@@ -542,21 +575,35 @@
..Default::default()
};

-        // When GPU support is requested, add NVIDIA device requests.
-        // This is the programmatic equivalent of `docker run --gpus all`.
-        // The NVIDIA Container Toolkit runtime hook injects /dev/nvidia* devices
-        // and GPU driver libraries from the host into the container.
-        if gpu {
-            host_config.device_requests = Some(vec![DeviceRequest {
-                driver: Some("nvidia".to_string()),
-                count: Some(-1), // all GPUs
-                capabilities: Some(vec![vec![
-                    "gpu".to_string(),
-                    "utility".to_string(),
-                    "compute".to_string(),
-                ]]),
-                ..Default::default()
-            }]);
+        // Inject GPU devices into the container based on the resolved device ID list.
+        //
+        // The list is pre-resolved by `resolve_gpu_device_ids` before reaching here:
+        //   []         — no GPU passthrough
+        //   ["legacy"] — legacy nvidia DeviceRequest (driver="nvidia", count=-1);
+        //                relies on the NVIDIA Container Runtime hook
+        //   [cdi-ids…] — CDI DeviceRequest (driver="cdi") with the given device IDs;
+        //                Docker resolves them against the host CDI spec at /etc/cdi/
+        match device_ids {
+            [] => {}
+            [id] if id == "legacy" => {
+                host_config.device_requests = Some(vec![DeviceRequest {
+                    driver: Some("nvidia".to_string()),
+                    count: Some(-1), // all GPUs
+                    capabilities: Some(vec![vec![
+                        "gpu".to_string(),
+                        "utility".to_string(),
+                        "compute".to_string(),
+                    ]]),
+                    ..Default::default()
+                }]);
+            }
+            ids => {
+                host_config.device_requests = Some(vec![DeviceRequest {
+                    driver: Some("cdi".to_string()),
+                    device_ids: Some(ids.to_vec()),
+                    ..Default::default()
+                }]);
+            }
+        }

let mut cmd = vec![
@@ -671,7 +718,7 @@

// GPU support: tell the entrypoint to deploy the NVIDIA device plugin
// HelmChart CR so k8s workloads can request nvidia.com/gpu resources.
-    if gpu {
+    if !device_ids.is_empty() {
env_vars.push("GPU_ENABLED=true".to_string());
}

@@ -1195,4 +1242,75 @@ mod tests {
"should return a reasonable number of sockets"
);
}

#[test]
fn cdi_enabled_with_spec_dirs() {
let dirs = vec!["/etc/cdi".to_string(), "/var/run/cdi".to_string()];
assert!(cdi_enabled(Some(&dirs)));
}

#[test]
fn cdi_enabled_with_single_spec_dir() {
let dirs = vec!["/etc/cdi".to_string()];
assert!(cdi_enabled(Some(&dirs)));
}

#[test]
fn cdi_enabled_with_empty_spec_dirs() {
assert!(!cdi_enabled(Some(&[])));
}

#[test]
fn cdi_enabled_with_none() {
assert!(!cdi_enabled(None));
}

// --- resolve_gpu_device_ids ---

#[test]
fn resolve_gpu_empty_returns_empty() {
assert_eq!(resolve_gpu_device_ids(&[], true), Vec::<String>::new());
assert_eq!(resolve_gpu_device_ids(&[], false), Vec::<String>::new());
}

#[test]
fn resolve_gpu_auto_cdi_enabled() {
assert_eq!(
resolve_gpu_device_ids(&["auto".to_string()], true),
vec!["nvidia.com/gpu=all"],
);
}

#[test]
fn resolve_gpu_auto_cdi_disabled() {
assert_eq!(
resolve_gpu_device_ids(&["auto".to_string()], false),
vec!["legacy"],
);
}

#[test]
fn resolve_gpu_legacy_passthrough() {
assert_eq!(
resolve_gpu_device_ids(&["legacy".to_string()], true),
vec!["legacy"],
);
assert_eq!(
resolve_gpu_device_ids(&["legacy".to_string()], false),
vec!["legacy"],
);
}

#[test]
fn resolve_gpu_cdi_ids_passthrough() {
let ids = vec!["nvidia.com/gpu=all".to_string()];
assert_eq!(resolve_gpu_device_ids(&ids, true), ids);
assert_eq!(resolve_gpu_device_ids(&ids, false), ids);

let multi = vec![
"nvidia.com/gpu=0".to_string(),
"nvidia.com/gpu=1".to_string(),
];
assert_eq!(resolve_gpu_device_ids(&multi, true), multi);
}
}
36 changes: 27 additions & 9 deletions crates/openshell-bootstrap/src/lib.rs
@@ -31,7 +31,8 @@ use crate::constants::{
};
use crate::docker::{
check_existing_gateway, check_port_conflicts, destroy_gateway_resources, ensure_container,
-    ensure_image, ensure_network, ensure_volume, start_container, stop_container,
+    ensure_image, ensure_network, ensure_volume, resolve_gpu_device_ids, start_container,
+    stop_container,
};
use crate::metadata::{
create_gateway_metadata, create_gateway_metadata_with_host, local_gateway_host,
@@ -111,10 +112,13 @@ pub struct DeployOptions {
/// bootstrap pull and inside the k3s cluster at runtime. Only needed
/// for private registries.
pub registry_token: Option<String>,
-    /// Enable NVIDIA GPU passthrough. When true, the Docker container is
-    /// created with GPU device requests (`--gpus all`) and the NVIDIA
-    /// k8s-device-plugin is deployed inside the k3s cluster.
-    pub gpu: bool,
+    /// GPU device IDs to inject into the gateway container.
+    ///
+    /// - `[]` — no GPU passthrough (default)
+    /// - `["legacy"]` — legacy nvidia DeviceRequest (driver="nvidia", count=-1)
+    /// - `["auto"]` — resolved at deploy time: CDI if enabled on the daemon, else legacy
+    /// - `[cdi-ids…]` — CDI DeviceRequest with the given device IDs
+    pub gpu: Vec<String>,
/// When true, destroy any existing gateway resources before deploying.
/// When false, an existing gateway is left as-is and deployment is
/// skipped (the caller is responsible for prompting the user first).
@@ -133,7 +137,7 @@ impl DeployOptions {
disable_gateway_auth: false,
registry_username: None,
registry_token: None,
-            gpu: false,
+            gpu: vec![],
recreate: false,
}
}
@@ -187,9 +191,13 @@
self
}

-    /// Enable NVIDIA GPU passthrough for the cluster container.
+    /// Set GPU device IDs for the cluster container.
+    ///
+    /// Pass `vec!["auto"]` to auto-select between CDI and legacy based on
+    /// whether CDI is enabled on the Docker daemon at deploy time, or an
+    /// explicit list of CDI device IDs, or `vec!["legacy"]` to force the
+    /// legacy nvidia DeviceRequest.
     #[must_use]
-    pub fn with_gpu(mut self, gpu: bool) -> Self {
+    pub fn with_gpu(mut self, gpu: Vec<String>) -> Self {
self.gpu = gpu;
self
}
@@ -288,6 +296,15 @@
(preflight.docker, None)
};

// CDI support is detected via SystemInfo.CDISpecDirs (best-effort — failure
// is non-fatal and results in a legacy GPU injection fallback).
let cdi_supported = target_docker
.info()
.await
.ok()
.and_then(|info| info.cdi_spec_dirs)
.is_some_and(|dirs| !dirs.is_empty());

// If an existing gateway is found, either tear it down (when recreate is
// requested) or bail out so the caller can prompt the user / reuse it.
if let Some(existing) = check_existing_gateway(&target_docker, &name).await? {
Expand Down Expand Up @@ -405,6 +422,7 @@ where
// leaving an orphaned volume in a corrupted state that blocks retries.
// See: https://github.com/NVIDIA/OpenShell/issues/463
let deploy_result: Result<GatewayMetadata> = async {
let device_ids = resolve_gpu_device_ids(&gpu, cdi_supported);
ensure_container(
&target_docker,
&name,
@@ -416,7 +434,7 @@
disable_gateway_auth,
registry_username.as_deref(),
registry_token.as_deref(),
-            gpu,
+            &device_ids,
)
.await?;
start_container(&target_docker, &name).await?;
6 changes: 5 additions & 1 deletion crates/openshell-cli/src/bootstrap.rs
@@ -178,7 +178,11 @@ pub async fn run_bootstrap(
{
options = options.with_gateway_host(host);
}
-    options = options.with_gpu(gpu);
+    options = options.with_gpu(if gpu {
+        vec!["auto".to_string()]
+    } else {
+        vec![]
+    });

let handle = deploy_gateway_with_panel(options, &gateway_name, location).await?;
let server = handle.gateway_endpoint().to_string();
Expand Down