From cd8313f0204213db6efe5a4b6f047e72ad81775a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 5 Nov 2025 10:54:07 +0100 Subject: [PATCH 1/3] UPSTREAM: smp: Introduce a helper function to check for pending IPIs When governors used during cpuidle try to find the most optimal idle state for a CPU or a group of CPUs, they are known to quite often fail. One reason for this is, that they are not taking into account whether there has been an IPI scheduled for any of the CPUs that are affected by the selected idle state. To enable pending IPIs to be taken into account for cpuidle decisions, introduce a new helper function, cpus_peek_for_pending_ipi(). Link: https://lore.kernel.org/all/20251105095415.17269-2-ulf.hansson@linaro.org/ Suggested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Signed-off-by: Ulf Hansson Signed-off-by: Sneh Mankad --- include/linux/smp.h | 5 +++++ kernel/smp.c | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index 18e9c918325e5..91d0ecf3b8d3b 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -168,6 +168,7 @@ int smp_call_function_any(const struct cpumask *mask, void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); +bool cpus_peek_for_pending_ipi(const struct cpumask *mask); /* * Generic and arch helpers @@ -216,6 +217,10 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } +static inline bool cpus_peek_for_pending_ipi(const struct cpumask *mask) +{ + return false; +} #define setup_max_cpus 0 diff --git a/kernel/smp.c b/kernel/smp.c index 02f52291fae42..f349960f79cad 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1087,6 +1087,28 @@ void wake_up_all_idle_cpus(void) } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); +/** + * cpus_peek_for_pending_ipi - Check for pending IPI for CPUs + * @mask: The CPU mask for the CPUs to check. + * + * This function walks through the @mask to check if there are any pending IPIs + * scheduled, for any of the CPUs in the @mask. It does not guarantee + * correctness as it only provides a racy snapshot. + * + * Returns true if there is a pending IPI scheduled and false otherwise. + */ +bool cpus_peek_for_pending_ipi(const struct cpumask *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (!llist_empty(per_cpu_ptr(&call_single_queue, cpu))) + return true; + } + + return false; +} + /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct From b52423431575c07eb8e52c18db193c8230b6ccbe Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 5 Nov 2025 10:54:08 +0100 Subject: [PATCH 2/3] UPSTREAM: pmdomain: Extend the genpd governor for CPUs to account for IPIs When the genpd governor for CPUs, tries to select the most optimal idle state for a group of CPUs managed in a PM domain, it fails far too often. On a Dragonboard 410c, which is an arm64 based platform with 4 CPUs in one cluster that is using PSCI OS-initiated mode, we can observe that we often fail when trying to enter the selected idle state. This is certainly a suboptimal behaviour that leads to many unnecessary requests being sent to the PSCI FW. A simple dd operation that reads from the eMMC, to generate some IRQs and I/O handling helps us to understand the problem, while also monitoring the rejected counters in debugfs for the corresponding idle states of the genpd in question. Menu governor: cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 1451 437 91 149 0 S1 65194 558 149 172 0 dd if=/dev/mmcblk0 of=/dev/null bs=1M count=500 524288000 bytes (500.0MB) copied, 3.562698 seconds, 140.3MB/s cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 2694 1073 265 892 1 S1 74567 829 561 790 0 The dd completed in ~3.6 seconds and rejects increased with 586. Teo governor: cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 4976 2096 392 1721 2 S1 160661 1893 1309 1904 0 dd if=/dev/mmcblk0 of=/dev/null bs=1M count=500 524288000 bytes (500.0MB) copied, 3.543225 seconds, 141.1MB/s cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 5192 2194 433 1830 2 S1 167677 2891 3184 4729 0 The dd completed in ~3.6 seconds and rejects increased with 1916. The main reason to the above problem is pending IPIs for one of the CPUs that is affected by the idle state that the genpd governor selected. This leads to that the PSCI FW refuses to enter it. To improve the behaviour, let's start to take into account pending IPIs for CPUs in the genpd governor, hence we fallback to use the shallower per CPU idle state. Re-testing with this change shows a significant improved behaviour. - Menu governor: cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 2556 878 19 368 1 S1 69974 596 10 152 0 dd if=/dev/mmcblk0 of=/dev/null bs=1M count=500 524288000 bytes (500.0MB) copied, 3.522010 seconds, 142.0MB/s cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 3360 1320 28 819 1 S1 70168 710 11 267 0 The dd completed in ~3.5 seconds and rejects increased with 10. - Teo governor cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 5145 1861 39 938 1 S1 188887 3117 51 1975 0 dd if=/dev/mmcblk0 of=/dev/null bs=1M count=500 524288000 bytes (500.0MB) copied, 3.653100 seconds, 136.9MB/s cat /sys/kernel/debug/pm_genpd/power-domain-cluster/idle_states State Time Spent(ms) Usage Rejected Above Below S0 5260 1923 42 1002 1 S1 190849 4033 52 2892 0 The dd completed in ~3.7 seconds and rejects increased with 4. Note that, the rejected counters in genpd are also being accumulated in the rejected counters that are managed by cpuidle, yet on a per CPU idle states basis. Comparing these counters before/after this change, through cpuidle's sysfs interface shows the similar improvements. Link: https://lore.kernel.org/all/20251105095415.17269-3-ulf.hansson@linaro.org/ Signed-off-by: Ulf Hansson Signed-off-by: Sneh Mankad --- drivers/pmdomain/governor.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/pmdomain/governor.c b/drivers/pmdomain/governor.c index 39359811a9304..a46470f2261a9 100644 --- a/drivers/pmdomain/governor.c +++ b/drivers/pmdomain/governor.c @@ -404,15 +404,21 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) if ((idle_duration_ns >= (genpd->states[i].residency_ns + genpd->states[i].power_off_latency_ns)) && (global_constraint >= (genpd->states[i].power_on_latency_ns + - genpd->states[i].power_off_latency_ns))) { - genpd->state_idx = i; - genpd->gd->last_enter = now; - genpd->gd->reflect_residency = true; - return true; - } + genpd->states[i].power_off_latency_ns))) + break; + } while (--i >= 0); - return false; + if (i < 0) + return false; + + if (cpus_peek_for_pending_ipi(genpd->cpus)) + return false; + + genpd->state_idx = i; + genpd->gd->last_enter = now; + genpd->gd->reflect_residency = true; + return true; } struct dev_power_governor pm_domain_cpu_gov = { From 3caa15b2812b4ccf768948cfa4f0fe7329eab5db Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Fri, 3 Apr 2026 09:38:36 +0530 Subject: [PATCH 3/3] FROMLIST: cpuidle: Deny idle entry when CPU already have IPI interrupt pending CPU can get IPI interrupt from another CPU while it is executing cpuidle_select() or about to execute same. The selection do not account for pending interrupts and may continue to enter selected idle state only to exit immediately. Example trace collected when there is cross CPU IPI. [000] 154.892148: sched_waking: comm=sugov:4 pid=491 prio=-1 target_cpu=007 [000] 154.892148: ipi_raise: target_mask=00000000,00000080 (Function call interrupts) [007] 154.892162: cpu_idle: state=2 cpu_id=7 [007] 154.892208: cpu_idle: state=4294967295 cpu_id=7 [007] 154.892211: irq_handler_entry: irq=2 name=IPI [007] 154.892211: ipi_entry: (Function call interrupts) [007] 154.892213: sched_wakeup: comm=sugov:4 pid=491 prio=-1 target_cpu=007 [007] 154.892214: ipi_exit: (Function call interrupts) This impacts performance and the above count increments. commit ccde6525183c ("smp: Introduce a helper function to check for pending IPIs") already introduced a helper function to check the pending IPIs and it is used in pmdomain governor to deny the cluster level idle state when there is a pending IPI on any of cluster CPUs. This however does not stop CPU to enter CPU level idle state. Make use of same at CPUidle to deny the idle entry when there is already IPI pending. With change observing glmark2 [1] off screen scores improving in the range of 25% to 30% on Qualcomm lemans-evk board which is arm64 based having two clusters each with 4 CPUs. [1] https://github.com/glmark2/glmark2 Link: https://lore.kernel.org/r/20260403-cpuidle_ipi-v2-1-b3e44b032e2c@oss.qualcomm.com Signed-off-by: Maulik Shah Signed-off-by: Sneh Mankad --- drivers/cpuidle/cpuidle.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8950796a493de..ed5f13573eb1b 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -222,6 +222,9 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev, bool broadcast = !!(target_state->flags & CPUIDLE_FLAG_TIMER_STOP); ktime_t time_start, time_end; + if (cpus_peek_for_pending_ipi(cpumask_of(dev->cpu))) + return -EBUSY; + instrumentation_begin(); /*