From c32536f3f4b3742cb6e37f238de473ae0fe84e5b Mon Sep 17 00:00:00 2001 From: Dhinesh Ponnarasan Date: Mon, 23 Mar 2026 01:58:36 -0400 Subject: [PATCH 1/2] Fix D3cold re-entry after AC replug by bounding idle holdoff rescheduling --- kernel-open/nvidia/nv-acpi.c | 5 +++ kernel-open/nvidia/nv.c | 43 +++++++++++-------- .../arch/nvalloc/unix/include/nv-priv.h | 13 +++--- .../arch/nvalloc/unix/src/dynamic-power.c | 43 ++++++++++++++++++- src/nvidia/arch/nvalloc/unix/src/osapi.c | 6 +++ 5 files changed, 83 insertions(+), 27 deletions(-) diff --git a/kernel-open/nvidia/nv-acpi.c b/kernel-open/nvidia/nv-acpi.c index d0fe94a03..0b46ed1a7 100644 --- a/kernel-open/nvidia/nv-acpi.c +++ b/kernel-open/nvidia/nv-acpi.c @@ -140,6 +140,11 @@ static void nv_acpi_powersource_hotplug_event(acpi_handle handle, u32 event_type if (nv_acpi_get_powersource(&ac_plugged) != NV_OK) return; + nv_printf(NV_DBG_INFO, + "NVRM: [RTD3] ACPI power-source event: type=0x%x, ac_plugged=%u " + "(calling rm_power_source_change_event with battery=%u)\n", + event_type, ac_plugged, !ac_plugged); + rm_power_source_change_event(pNvAcpiObject->sp, !ac_plugged); } } diff --git a/kernel-open/nvidia/nv.c b/kernel-open/nvidia/nv.c index c199fd9e6..a8223203d 100644 --- a/kernel-open/nvidia/nv.c +++ b/kernel-open/nvidia/nv.c @@ -5085,9 +5085,15 @@ nvidia_transition_dynamic_power( if ((nv->flags & (NV_FLAG_INITIALIZED | NV_FLAG_PERSISTENT_SW_STATE)) == 0) { + dev_info(dev, "NVRM: [RTD3] transition_dynamic_power: %s skipped (not initialized)\n", + enter ? "enter(suspend)" : "exit(resume)"); return 0; } + dev_info(dev, "NVRM: [RTD3] transition_dynamic_power: %s begin, usage_count=%d\n", + enter ? "enter(suspend)" : "exit(resume)", + atomic_read(&dev->power.usage_count)); + if (nv_kmem_cache_alloc_stack(&sp) != 0) { return -ENOMEM; @@ -5097,6 +5103,9 @@ nvidia_transition_dynamic_power( nv_kmem_cache_free_stack(sp); + dev_info(dev, "NVRM: [RTD3] transition_dynamic_power: %s done, status=%d, bTryAgain=%d\n", + enter ? "enter(suspend)" : "exit(resume)", status, bTryAgain); + if (bTryAgain) { /* @@ -5118,6 +5127,9 @@ int nv_pmops_runtime_suspend( nv_linux_state_t *nvl = pci_get_drvdata(pci_dev); nv_state_t *nv = NV_STATE_PTR(nvl); + dev_info(dev, "NVRM: [RTD3] pmops_runtime_suspend: entry, usage_count=%d\n", + atomic_read(&dev->power.usage_count)); + #if defined(CONFIG_PM_DEVFREQ) if (nvl->devfreq_suspend != NULL) { @@ -5145,9 +5157,11 @@ int nv_pmops_runtime_suspend( } } + dev_info(dev, "NVRM: [RTD3] pmops_runtime_suspend: exit ok, err=%d\n", err); return err; nv_pmops_runtime_suspend_exit: + dev_info(dev, "NVRM: [RTD3] pmops_runtime_suspend: exit error, err=%d\n", err); #if defined(CONFIG_PM_DEVFREQ) if (nvl->devfreq_resume != NULL) { @@ -5169,6 +5183,9 @@ int nv_pmops_runtime_resume( nv_pci_tegra_boost_clocks(dev); #endif + dev_info(dev, "NVRM: [RTD3] pmops_runtime_resume: entry, usage_count=%d\n", + atomic_read(&dev->power.usage_count)); + err = nvidia_transition_dynamic_power(dev, NV_FALSE); if (err) { @@ -5551,6 +5568,8 @@ NV_STATUS NV_API_CALL nv_indicate_idle( char buf; pm_runtime_put_noidle(dev); + dev_info(dev, "NVRM: [RTD3] nv_indicate_idle: pm_runtime_put_noidle, usage_count=%d\n", + atomic_read(&dev->power.usage_count)); #if defined(NV_SEQ_READ_ITER_PRESENT) { @@ -5591,6 +5610,8 @@ NV_STATUS NV_API_CALL nv_indicate_not_idle( struct device *dev = nvl->dev; pm_runtime_get_noresume(dev); + dev_info(dev, "NVRM: [RTD3] nv_indicate_not_idle: pm_runtime_get_noresume, usage_count=%d\n", + atomic_read(&dev->power.usage_count)); nvl->is_forced_shutdown = NV_TRUE; pci_bus_type.shutdown(dev); @@ -6147,15 +6168,8 @@ void NV_API_CALL nv_allow_runtime_suspend nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv); struct device *dev = nvl->dev; - spin_lock_irq(&dev->power.lock); - - if (dev->power.runtime_auto == false) - { - dev->power.runtime_auto = true; - atomic_add_unless(&dev->power.usage_count, -1, 0); - } - - spin_unlock_irq(&dev->power.lock); + dev_info(dev, "NVRM: [RTD3] nv_allow_runtime_suspend: enabling runtime PM\n"); + pm_runtime_allow(dev); #endif } @@ -6168,15 +6182,8 @@ void NV_API_CALL nv_disallow_runtime_suspend nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv); struct device *dev = nvl->dev; - spin_lock_irq(&dev->power.lock); - - if (dev->power.runtime_auto == true) - { - dev->power.runtime_auto = false; - atomic_inc(&dev->power.usage_count); - } - - spin_unlock_irq(&dev->power.lock); + dev_info(dev, "NVRM: [RTD3] nv_disallow_runtime_suspend: disabling runtime PM\n"); + pm_runtime_forbid(dev); #endif } diff --git a/src/nvidia/arch/nvalloc/unix/include/nv-priv.h b/src/nvidia/arch/nvalloc/unix/include/nv-priv.h index a3dcebfd3..42e108b38 100644 --- a/src/nvidia/arch/nvalloc/unix/include/nv-priv.h +++ b/src/nvidia/arch/nvalloc/unix/include/nv-priv.h @@ -279,15 +279,12 @@ typedef struct nv_dynamic_power_s NvBool b_idle_sustained_workitem_queued; /* - * Counter to track clients disallowing GCOFF. + * Counter to track how many times RmRemoveIdleHoldoff has rescheduled itself. + * Used to prevent infinite rescheduling when GC6 prerequisites cannot be met. + * After exceeding MAX_IDLE_HOLDOFF_RESCHEDULES, nv_indicate_idle is called + * unconditionally to allow D3cold entry via autosuspend path. */ - NvU32 clients_gcoff_disallow_refcount; - - /* - * Maximum FB allocation size which can be saved in system memory - * while doing GCOFF based dynamic PM. - */ - NvU64 gcoff_max_fb_size; + NvU32 idle_holdoff_reschedule_count; /* * NVreg_DynamicPowerManagement regkey value set by the user diff --git a/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c b/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c index c465e0f89..107c0ec06 100644 --- a/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c +++ b/src/nvidia/arch/nvalloc/unix/src/dynamic-power.c @@ -154,6 +154,17 @@ static NvU32 dynamicPowerSupportGpuMask = 0; // #define IGPU_RG_BLOCKER_CHECK_AND_METHOD_FLUSH_TIME (700 * 1000 * 1000) +// +// Maximum number of times RmRemoveIdleHoldoff() can reschedule itself +// before forcefully calling nv_indicate_idle(). This prevents infinite +// rescheduling when GC6 prerequisites cannot be met (e.g., AC power mode +// blocks GC6 entry). After this threshold, D3cold entry is forced via the +// autosuspend path (without GC6 entry checks). +// With GC6_PRECONDITION_CHECK_TIME = 5 seconds: +// 4 reschedules = ~20 seconds of waiting before fallback to autosuspend +// +#define MAX_IDLE_HOLDOFF_RESCHEDULES 4 + // // Cap Maximum FB allocation size for GCOFF. If regkey value is greater // than this value then it will be capped to this value. @@ -1137,6 +1148,10 @@ os_ref_dynamic_power( ref = nvp->dynamic_power.refcount++; + NV_PRINTF(LEVEL_INFO, "[RTD3] os_ref_dynamic_power: mode=%d, refcount %d->%d, state=%s\n", + mode, ref, ref + 1, + nv_dynamic_power_state_string(nvp->dynamic_power.state)); + NV_ASSERT(ref >= 0); if (ref > 0) @@ -1321,6 +1336,10 @@ os_unref_dynamic_power( ref = --nvp->dynamic_power.refcount; + NV_PRINTF(LEVEL_INFO, "[RTD3] os_unref_dynamic_power: mode=%d, refcount %d->%d, state=%s\n", + mode, ref + 1, ref, + nv_dynamic_power_state_string(nvp->dynamic_power.state)); + NV_ASSERT(ref >= 0); if (ref == 0) { @@ -1463,11 +1482,30 @@ static void RmRemoveIdleHoldoff( { nv_indicate_idle(nv); nvp->dynamic_power.b_idle_holdoff = NV_FALSE; + nvp->dynamic_power.idle_holdoff_reschedule_count = 0; // Reset counter on success } - else + /* + * Prevent infinite rescheduling when GC6 is unavailable (e.g., AC mode). + * After a limited number of retries, force nv_indicate_idle() + * to allow runtime suspend fallback (D3 entry without GC6). + */ + else if (nvp->dynamic_power.idle_holdoff_reschedule_count < MAX_IDLE_HOLDOFF_RESCHEDULES) { + // Increment reschedule counter and reschedule + nvp->dynamic_power.idle_holdoff_reschedule_count++; RmScheduleCallbackToRemoveIdleHoldoff(pGpu); } + else + { + // Max reschedules reached: force nv_indicate_idle to break infinite loop + // This allows D3cold entry via autosuspend path even when GC6 is not available + NV_PRINTF(LEVEL_WARNING, + "NVRM: [RTD3] RmRemoveIdleHoldoff: GC6 unavailable after %d reschedules, forcing nv_indicate_idle\n", + MAX_IDLE_HOLDOFF_RESCHEDULES); + nv_indicate_idle(nv); + nvp->dynamic_power.b_idle_holdoff = NV_FALSE; + nvp->dynamic_power.idle_holdoff_reschedule_count = 0; // Reset counter + } } } @@ -1798,6 +1836,7 @@ void RmDestroyDeferredDynamicPowerManagement( { nv_indicate_idle(nv); nvp->dynamic_power.b_idle_holdoff = NV_FALSE; + nvp->dynamic_power.idle_holdoff_reschedule_count = 0; // Reset counter } RmCancelDynamicPowerCallbacks(pGpu); @@ -2053,6 +2092,7 @@ static void RmScheduleCallbackToRemoveIdleHoldoff( else { nvp->dynamic_power.b_idle_holdoff = NV_TRUE; + nvp->dynamic_power.idle_holdoff_reschedule_count = 0; // Initialize counter } } } @@ -2612,6 +2652,7 @@ NV_STATUS NV_API_CALL rm_power_management( nv_indicate_idle(pNv); RmCancelCallbackToRemoveIdleHoldoff(pGpu); nvp->dynamic_power.b_idle_holdoff = NV_FALSE; + nvp->dynamic_power.idle_holdoff_reschedule_count = 0; // Reset counter } // diff --git a/src/nvidia/arch/nvalloc/unix/src/osapi.c b/src/nvidia/arch/nvalloc/unix/src/osapi.c index 7a81680c6..0590d7ec2 100644 --- a/src/nvidia/arch/nvalloc/unix/src/osapi.c +++ b/src/nvidia/arch/nvalloc/unix/src/osapi.c @@ -4463,10 +4463,16 @@ void NV_API_CALL rm_power_source_change_event( // state has actually changed. If not, return without waking the GPU. if ((first_event_seen != NV_FALSE) && (last_event_val == event_val)) { + NV_PRINTF(LEVEL_INFO, "[RTD3] rm_power_source_change_event: " + "suppressed duplicate event_val=%u (%s)\n", + event_val, event_val ? "battery" : "AC"); return; } else { + NV_PRINTF(LEVEL_INFO, "[RTD3] rm_power_source_change_event: " + "processing event_val=%u (%s)\n", + event_val, event_val ? "battery" : "AC"); first_event_seen = NV_TRUE; last_event_val = event_val; } From e6f3c3a80263c7481e64e78a1351027aba78e282 Mon Sep 17 00:00:00 2001 From: Dhinesh Ponnarasan Date: Mon, 23 Mar 2026 06:15:42 -0400 Subject: [PATCH 2/2] Fix build: restore nv_dynamic_power_s GCOFF fields --- src/nvidia/arch/nvalloc/unix/include/nv-priv.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/nvidia/arch/nvalloc/unix/include/nv-priv.h b/src/nvidia/arch/nvalloc/unix/include/nv-priv.h index 42e108b38..d3ef1afa8 100644 --- a/src/nvidia/arch/nvalloc/unix/include/nv-priv.h +++ b/src/nvidia/arch/nvalloc/unix/include/nv-priv.h @@ -286,6 +286,17 @@ typedef struct nv_dynamic_power_s */ NvU32 idle_holdoff_reschedule_count; + /* + * Counter to track clients disallowing GCOFF. + */ + NvU32 clients_gcoff_disallow_refcount; + + /* + * Maximum FB allocation size which can be saved in system memory + * while doing GCOFF based dynamic PM. + */ + NvU64 gcoff_max_fb_size; + /* * NVreg_DynamicPowerManagement regkey value set by the user */