The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c9723750a699c3bd465493ac2be8992b72ccb105 Mon Sep 17 00:00:00 2001
From: Martin Fuzzey <martin.fuzzey(a)flowbird.group>
Date: Wed, 30 Sep 2020 10:36:46 +0200
Subject: [PATCH] w1: mxc_w1: Fix timeout resolution problem leading to bus
error
On my platform (i.MX53) bus access sometimes fails with
w1_search: max_slave_count 64 reached, will continue next search.
The reason is the use of jiffies to implement a 200us timeout in
mxc_w1_ds2_touch_bit().
On some platforms the jiffies timer resolution is insufficient for this.
Fix by replacing jiffies by ktime_get().
For consistency apply the same change to the other use of jiffies in
mxc_w1_ds2_reset_bus().
Fixes: f80b2581a706 ("w1: mxc_w1: Optimize mxc_w1_ds2_touch_bit()")
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Martin Fuzzey <martin.fuzzey(a)flowbird.group>
Link: https://lore.kernel.org/r/1601455030-6607-1-git-send-email-martin.fuzzey@fl…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/w1/masters/mxc_w1.c b/drivers/w1/masters/mxc_w1.c
index 1ca880e01476..090cbbf9e1e2 100644
--- a/drivers/w1/masters/mxc_w1.c
+++ b/drivers/w1/masters/mxc_w1.c
@@ -7,7 +7,7 @@
#include <linux/clk.h>
#include <linux/delay.h>
#include <linux/io.h>
-#include <linux/jiffies.h>
+#include <linux/ktime.h>
#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/platform_device.h>
@@ -40,12 +40,12 @@ struct mxc_w1_device {
static u8 mxc_w1_ds2_reset_bus(void *data)
{
struct mxc_w1_device *dev = data;
- unsigned long timeout;
+ ktime_t timeout;
writeb(MXC_W1_CONTROL_RPP, dev->regs + MXC_W1_CONTROL);
/* Wait for reset sequence 511+512us, use 1500us for sure */
- timeout = jiffies + usecs_to_jiffies(1500);
+ timeout = ktime_add_us(ktime_get(), 1500);
udelay(511 + 512);
@@ -55,7 +55,7 @@ static u8 mxc_w1_ds2_reset_bus(void *data)
/* PST bit is valid after the RPP bit is self-cleared */
if (!(ctrl & MXC_W1_CONTROL_RPP))
return !(ctrl & MXC_W1_CONTROL_PST);
- } while (time_is_after_jiffies(timeout));
+ } while (ktime_before(ktime_get(), timeout));
return 1;
}
@@ -68,12 +68,12 @@ static u8 mxc_w1_ds2_reset_bus(void *data)
static u8 mxc_w1_ds2_touch_bit(void *data, u8 bit)
{
struct mxc_w1_device *dev = data;
- unsigned long timeout;
+ ktime_t timeout;
writeb(MXC_W1_CONTROL_WR(bit), dev->regs + MXC_W1_CONTROL);
/* Wait for read/write bit (60us, Max 120us), use 200us for sure */
- timeout = jiffies + usecs_to_jiffies(200);
+ timeout = ktime_add_us(ktime_get(), 200);
udelay(60);
@@ -83,7 +83,7 @@ static u8 mxc_w1_ds2_touch_bit(void *data, u8 bit)
/* RDST bit is valid after the WR1/RD bit is self-cleared */
if (!(ctrl & MXC_W1_CONTROL_WR(bit)))
return !!(ctrl & MXC_W1_CONTROL_RDST);
- } while (time_is_after_jiffies(timeout));
+ } while (ktime_before(ktime_get(), timeout));
return 0;
}
Making perf with gcc-9.1.1 generates the following warning:
CC ui/browsers/hists.o
ui/browsers/hists.c: In function 'perf_evsel__hists_browse':
ui/browsers/hists.c:3078:61: error: '%d' directive output may be \
truncated writing between 1 and 11 bytes into a region of size \
between 2 and 12 [-Werror=format-truncation=]
3078 | "Max event group index to sort is %d (index from 0 to %d)",
| ^~
ui/browsers/hists.c:3078:7: note: directive argument in the range [-2147483648, 8]
3078 | "Max event group index to sort is %d (index from 0 to %d)",
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /usr/include/stdio.h:937,
from ui/browsers/hists.c:5:
IOW, the string in line 3078 might be too long for buf[] of 64 bytes.
Fix this by increasing the size of buf[] to 128.
Fixes: dbddf1747441 ("perf report/top TUI: Support hotkeys to let user select any event for sorting")
Cc: stable <stable(a)vger.kernel.org> # v5.7+
Cc: Jin Yao <yao.jin(a)linux.intel.com>
Cc: Jiri Olsa <jolsa(a)kernel.org>
Cc: Arnaldo Carvalho de Melo <acme(a)redhat.com>
Cc: Arnaldo Carvalho de Melo <acme(a)kernel.org>
Signed-off-by: Song Liu <songliubraving(a)fb.com>
---
tools/perf/ui/browsers/hists.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index a07626f072087..b0e1880cf992b 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -2963,7 +2963,7 @@ static int perf_evsel__hists_browse(struct evsel *evsel, int nr_events,
struct popup_action actions[MAX_OPTIONS];
int nr_options = 0;
int key = -1;
- char buf[64];
+ char buf[128];
int delay_secs = hbt ? hbt->refresh : 0;
#define HIST_BROWSER_HELP_COMMON \
--
2.24.1
We've fixed many races in panfrost_job_timedout() but some remain.
Instead of trying to fix it again, let's simplify the logic and move
the reset bits to a separate work scheduled when one of the queue
reports a timeout.
v3:
- Replace the atomic_cmpxchg() by an atomic_xchg() (Robin Murphy)
- Add Steven's R-b
v2:
- Use atomic_cmpxchg() to conditionally schedule the reset work (Steven Price)
Fixes: 1a11a88cfd9a ("drm/panfrost: Fix job timeout handling")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
Reviewed-by: Steven Price <steven.price(a)arm.com>
---
drivers/gpu/drm/panfrost/panfrost_device.c | 1 -
drivers/gpu/drm/panfrost/panfrost_device.h | 6 +-
drivers/gpu/drm/panfrost/panfrost_job.c | 127 ++++++++++++---------
3 files changed, 79 insertions(+), 55 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index ea8d31863c50..a83b2ff5837a 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -200,7 +200,6 @@ int panfrost_device_init(struct panfrost_device *pfdev)
struct resource *res;
mutex_init(&pfdev->sched_lock);
- mutex_init(&pfdev->reset_lock);
INIT_LIST_HEAD(&pfdev->scheduled_jobs);
INIT_LIST_HEAD(&pfdev->as_lru_list);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index 140e004a3790..597cf1459b0a 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -106,7 +106,11 @@ struct panfrost_device {
struct panfrost_perfcnt *perfcnt;
struct mutex sched_lock;
- struct mutex reset_lock;
+
+ struct {
+ struct work_struct work;
+ atomic_t pending;
+ } reset;
struct mutex shrinker_lock;
struct list_head shrinker_list;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 4902bc6624c8..9691d6248f6d 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -20,6 +20,8 @@
#include "panfrost_gpu.h"
#include "panfrost_mmu.h"
+#define JOB_TIMEOUT_MS 500
+
#define job_write(dev, reg, data) writel(data, dev->iomem + (reg))
#define job_read(dev, reg) readl(dev->iomem + (reg))
@@ -382,19 +384,37 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
drm_sched_increase_karma(bad);
queue->stopped = true;
stopped = true;
+
+ /*
+ * Set the timeout to max so the timer doesn't get started
+ * when we return from the timeout handler (restored in
+ * panfrost_scheduler_start()).
+ */
+ queue->sched.timeout = MAX_SCHEDULE_TIMEOUT;
}
mutex_unlock(&queue->lock);
return stopped;
}
+static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
+{
+ if (WARN_ON(!queue->stopped))
+ return;
+
+ mutex_lock(&queue->lock);
+ /* Restore the original timeout before starting the scheduler. */
+ queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
+ drm_sched_start(&queue->sched, true);
+ queue->stopped = false;
+ mutex_unlock(&queue->lock);
+}
+
static void panfrost_job_timedout(struct drm_sched_job *sched_job)
{
struct panfrost_job *job = to_panfrost_job(sched_job);
struct panfrost_device *pfdev = job->pfdev;
int js = panfrost_job_get_slot(job);
- unsigned long flags;
- int i;
/*
* If the GPU managed to complete this jobs fence, the timeout is
@@ -415,56 +435,9 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
return;
- if (!mutex_trylock(&pfdev->reset_lock))
- return;
-
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
- struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;
-
- /*
- * If the queue is still active, make sure we wait for any
- * pending timeouts.
- */
- if (!pfdev->js->queue[i].stopped)
- cancel_delayed_work_sync(&sched->work_tdr);
-
- /*
- * If the scheduler was not already stopped, there's a tiny
- * chance a timeout has expired just before we stopped it, and
- * drm_sched_stop() does not flush pending works. Let's flush
- * them now so the timeout handler doesn't get called in the
- * middle of a reset.
- */
- if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
- cancel_delayed_work_sync(&sched->work_tdr);
-
- /*
- * Now that we cancelled the pending timeouts, we can safely
- * reset the stopped state.
- */
- pfdev->js->queue[i].stopped = false;
- }
-
- spin_lock_irqsave(&pfdev->js->job_lock, flags);
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
- if (pfdev->jobs[i]) {
- pm_runtime_put_noidle(pfdev->dev);
- panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
- pfdev->jobs[i] = NULL;
- }
- }
- spin_unlock_irqrestore(&pfdev->js->job_lock, flags);
-
- panfrost_device_reset(pfdev);
-
- for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
-
- mutex_unlock(&pfdev->reset_lock);
-
- /* restart scheduler after GPU is usable again */
- for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_start(&pfdev->js->queue[i].sched, true);
+ /* Schedule a reset if there's no reset in progress. */
+ if (!atomic_xchg(&pfdev->reset.pending, 1))
+ schedule_work(&pfdev->reset.work);
}
static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -531,11 +504,59 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
return IRQ_HANDLED;
}
+static void panfrost_reset(struct work_struct *work)
+{
+ struct panfrost_device *pfdev = container_of(work,
+ struct panfrost_device,
+ reset.work);
+ unsigned long flags;
+ unsigned int i;
+
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ /*
+ * We want pending timeouts to be handled before we attempt
+ * to stop the scheduler. If we don't do that and the timeout
+ * handler is in flight, it might have removed the bad job
+ * from the list, and we'll lose this job if the reset handler
+ * enters the critical section in panfrost_scheduler_stop()
+ * before the timeout handler.
+ *
+ * Timeout is set to max to make sure the timer is not
+ * restarted after the cancellation.
+ */
+ pfdev->js->queue[i].sched.timeout = MAX_SCHEDULE_TIMEOUT;
+ cancel_delayed_work_sync(&pfdev->js->queue[i].sched.work_tdr);
+ panfrost_scheduler_stop(&pfdev->js->queue[i], NULL);
+ }
+
+ /* All timers have been stopped, we can safely reset the pending state. */
+ atomic_set(&pfdev->reset.pending, 0);
+
+ spin_lock_irqsave(&pfdev->js->job_lock, flags);
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ if (pfdev->jobs[i]) {
+ pm_runtime_put_noidle(pfdev->dev);
+ panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
+ pfdev->jobs[i] = NULL;
+ }
+ }
+ spin_unlock_irqrestore(&pfdev->js->job_lock, flags);
+
+ panfrost_device_reset(pfdev);
+
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
+ panfrost_scheduler_start(&pfdev->js->queue[i]);
+ }
+}
+
int panfrost_job_init(struct panfrost_device *pfdev)
{
struct panfrost_job_slot *js;
int ret, j, irq;
+ INIT_WORK(&pfdev->reset.work, panfrost_reset);
+
pfdev->js = js = devm_kzalloc(pfdev->dev, sizeof(*js), GFP_KERNEL);
if (!js)
return -ENOMEM;
@@ -560,7 +581,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
ret = drm_sched_init(&js->queue[j].sched,
&panfrost_sched_ops,
- 1, 0, msecs_to_jiffies(500),
+ 1, 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
"pan_js");
if (ret) {
dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
--
2.26.2
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9a2e849fb6de471b82d19989a7944d3b7671793c Mon Sep 17 00:00:00 2001
From: Hanjun Guo <guohanjun(a)huawei.com>
Date: Fri, 18 Sep 2020 17:13:28 +0800
Subject: [PATCH] ACPI: configfs: Add missing config_item_put() to fix refcount
leak
config_item_put() should be called in the drop_item callback, to
decrement refcount for the config item.
Fixes: 772bf1e2878ec ("ACPI: configfs: Unload SSDT on configfs entry removal")
Signed-off-by: Hanjun Guo <guohanjun(a)huawei.com>
[ rjw: Subject edit ]
Cc: 4.13+ <stable(a)vger.kernel.org> # 4.13+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/acpi_configfs.c b/drivers/acpi/acpi_configfs.c
index 88c8af455ea3..cf91f49101ea 100644
--- a/drivers/acpi/acpi_configfs.c
+++ b/drivers/acpi/acpi_configfs.c
@@ -228,6 +228,7 @@ static void acpi_table_drop_item(struct config_group *group,
ACPI_INFO(("Host-directed Dynamic ACPI Table Unload"));
acpi_unload_table(table->index);
+ config_item_put(cfg);
}
static struct configfs_group_operations acpi_table_group_ops = {
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9a2e849fb6de471b82d19989a7944d3b7671793c Mon Sep 17 00:00:00 2001
From: Hanjun Guo <guohanjun(a)huawei.com>
Date: Fri, 18 Sep 2020 17:13:28 +0800
Subject: [PATCH] ACPI: configfs: Add missing config_item_put() to fix refcount
leak
config_item_put() should be called in the drop_item callback, to
decrement refcount for the config item.
Fixes: 772bf1e2878ec ("ACPI: configfs: Unload SSDT on configfs entry removal")
Signed-off-by: Hanjun Guo <guohanjun(a)huawei.com>
[ rjw: Subject edit ]
Cc: 4.13+ <stable(a)vger.kernel.org> # 4.13+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/acpi_configfs.c b/drivers/acpi/acpi_configfs.c
index 88c8af455ea3..cf91f49101ea 100644
--- a/drivers/acpi/acpi_configfs.c
+++ b/drivers/acpi/acpi_configfs.c
@@ -228,6 +228,7 @@ static void acpi_table_drop_item(struct config_group *group,
ACPI_INFO(("Host-directed Dynamic ACPI Table Unload"));
acpi_unload_table(table->index);
+ config_item_put(cfg);
}
static struct configfs_group_operations acpi_table_group_ops = {
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9a2e849fb6de471b82d19989a7944d3b7671793c Mon Sep 17 00:00:00 2001
From: Hanjun Guo <guohanjun(a)huawei.com>
Date: Fri, 18 Sep 2020 17:13:28 +0800
Subject: [PATCH] ACPI: configfs: Add missing config_item_put() to fix refcount
leak
config_item_put() should be called in the drop_item callback, to
decrement refcount for the config item.
Fixes: 772bf1e2878ec ("ACPI: configfs: Unload SSDT on configfs entry removal")
Signed-off-by: Hanjun Guo <guohanjun(a)huawei.com>
[ rjw: Subject edit ]
Cc: 4.13+ <stable(a)vger.kernel.org> # 4.13+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/acpi_configfs.c b/drivers/acpi/acpi_configfs.c
index 88c8af455ea3..cf91f49101ea 100644
--- a/drivers/acpi/acpi_configfs.c
+++ b/drivers/acpi/acpi_configfs.c
@@ -228,6 +228,7 @@ static void acpi_table_drop_item(struct config_group *group,
ACPI_INFO(("Host-directed Dynamic ACPI Table Unload"));
acpi_unload_table(table->index);
+ config_item_put(cfg);
}
static struct configfs_group_operations acpi_table_group_ops = {
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c8fe99d0701fec9fb849ec880a86bc5592530496 Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips(a)amd.com>
Date: Tue, 8 Sep 2020 16:47:34 -0500
Subject: [PATCH] perf/amd/uncore: Set all slices and threads to restore perf
stat -a behaviour
Commit 2f217d58a8a0 ("perf/x86/amd/uncore: Set the thread mask for
F17h L3 PMCs") inadvertently changed the uncore driver's behaviour
wrt perf tool invocations with or without a CPU list, specified with
-C / --cpu=.
Change the behaviour of the driver to assume the former all-cpu (-a)
case, which is the more commonly desired default. This fixes
'-a -A' invocations without explicit cpu lists (-C) to not count
L3 events only on behalf of the first thread of the first core
in the L3 domain.
BEFORE:
Activity performed by the first thread of the last core (CPU#43) in
CPU#40's L3 domain is not reported by CPU#40:
sudo perf stat -a -A -e l3_request_g1.caching_l3_cache_accesses taskset -c 43 perf bench mem memcpy -s 32mb -l 100 -f default
...
CPU36 21,835 l3_request_g1.caching_l3_cache_accesses
CPU40 87,066 l3_request_g1.caching_l3_cache_accesses
CPU44 17,360 l3_request_g1.caching_l3_cache_accesses
...
AFTER:
The L3 domain activity is now reported by CPU#40:
sudo perf stat -a -A -e l3_request_g1.caching_l3_cache_accesses taskset -c 43 perf bench mem memcpy -s 32mb -l 100 -f default
...
CPU36 354,891 l3_request_g1.caching_l3_cache_accesses
CPU40 1,780,870 l3_request_g1.caching_l3_cache_accesses
CPU44 315,062 l3_request_g1.caching_l3_cache_accesses
...
Fixes: 2f217d58a8a0 ("perf/x86/amd/uncore: Set the thread mask for F17h L3 PMCs")
Signed-off-by: Kim Phillips <kim.phillips(a)amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20200908214740.18097-2-kim.phillips@amd.com
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 76400c052b0e..e7e61c8b56bd 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -181,28 +181,16 @@ static void amd_uncore_del(struct perf_event *event, int flags)
}
/*
- * Convert logical CPU number to L3 PMC Config ThreadMask format
+ * Return a full thread and slice mask until per-CPU is
+ * properly supported.
*/
-static u64 l3_thread_slice_mask(int cpu)
+static u64 l3_thread_slice_mask(void)
{
- u64 thread_mask, core = topology_core_id(cpu);
- unsigned int shift, thread = 0;
+ if (boot_cpu_data.x86 <= 0x18)
+ return AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK;
- if (topology_smt_supported() && !topology_is_primary_thread(cpu))
- thread = 1;
-
- if (boot_cpu_data.x86 <= 0x18) {
- shift = AMD64_L3_THREAD_SHIFT + 2 * (core % 4) + thread;
- thread_mask = BIT_ULL(shift);
-
- return AMD64_L3_SLICE_MASK | thread_mask;
- }
-
- core = (core << AMD64_L3_COREID_SHIFT) & AMD64_L3_COREID_MASK;
- shift = AMD64_L3_THREAD_SHIFT + thread;
- thread_mask = BIT_ULL(shift);
-
- return AMD64_L3_EN_ALL_SLICES | core | thread_mask;
+ return AMD64_L3_EN_ALL_SLICES | AMD64_L3_EN_ALL_CORES |
+ AMD64_L3_F19H_THREAD_MASK;
}
static int amd_uncore_event_init(struct perf_event *event)
@@ -232,7 +220,7 @@ static int amd_uncore_event_init(struct perf_event *event)
* For other events, the two fields do not affect the count.
*/
if (l3_mask && is_llc_event(event))
- hwc->config |= l3_thread_slice_mask(event->cpu);
+ hwc->config |= l3_thread_slice_mask();
uncore = event_to_amd_uncore(event);
if (!uncore)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c8fe99d0701fec9fb849ec880a86bc5592530496 Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips(a)amd.com>
Date: Tue, 8 Sep 2020 16:47:34 -0500
Subject: [PATCH] perf/amd/uncore: Set all slices and threads to restore perf
stat -a behaviour
Commit 2f217d58a8a0 ("perf/x86/amd/uncore: Set the thread mask for
F17h L3 PMCs") inadvertently changed the uncore driver's behaviour
wrt perf tool invocations with or without a CPU list, specified with
-C / --cpu=.
Change the behaviour of the driver to assume the former all-cpu (-a)
case, which is the more commonly desired default. This fixes
'-a -A' invocations without explicit cpu lists (-C) to not count
L3 events only on behalf of the first thread of the first core
in the L3 domain.
BEFORE:
Activity performed by the first thread of the last core (CPU#43) in
CPU#40's L3 domain is not reported by CPU#40:
sudo perf stat -a -A -e l3_request_g1.caching_l3_cache_accesses taskset -c 43 perf bench mem memcpy -s 32mb -l 100 -f default
...
CPU36 21,835 l3_request_g1.caching_l3_cache_accesses
CPU40 87,066 l3_request_g1.caching_l3_cache_accesses
CPU44 17,360 l3_request_g1.caching_l3_cache_accesses
...
AFTER:
The L3 domain activity is now reported by CPU#40:
sudo perf stat -a -A -e l3_request_g1.caching_l3_cache_accesses taskset -c 43 perf bench mem memcpy -s 32mb -l 100 -f default
...
CPU36 354,891 l3_request_g1.caching_l3_cache_accesses
CPU40 1,780,870 l3_request_g1.caching_l3_cache_accesses
CPU44 315,062 l3_request_g1.caching_l3_cache_accesses
...
Fixes: 2f217d58a8a0 ("perf/x86/amd/uncore: Set the thread mask for F17h L3 PMCs")
Signed-off-by: Kim Phillips <kim.phillips(a)amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20200908214740.18097-2-kim.phillips@amd.com
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 76400c052b0e..e7e61c8b56bd 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -181,28 +181,16 @@ static void amd_uncore_del(struct perf_event *event, int flags)
}
/*
- * Convert logical CPU number to L3 PMC Config ThreadMask format
+ * Return a full thread and slice mask until per-CPU is
+ * properly supported.
*/
-static u64 l3_thread_slice_mask(int cpu)
+static u64 l3_thread_slice_mask(void)
{
- u64 thread_mask, core = topology_core_id(cpu);
- unsigned int shift, thread = 0;
+ if (boot_cpu_data.x86 <= 0x18)
+ return AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK;
- if (topology_smt_supported() && !topology_is_primary_thread(cpu))
- thread = 1;
-
- if (boot_cpu_data.x86 <= 0x18) {
- shift = AMD64_L3_THREAD_SHIFT + 2 * (core % 4) + thread;
- thread_mask = BIT_ULL(shift);
-
- return AMD64_L3_SLICE_MASK | thread_mask;
- }
-
- core = (core << AMD64_L3_COREID_SHIFT) & AMD64_L3_COREID_MASK;
- shift = AMD64_L3_THREAD_SHIFT + thread;
- thread_mask = BIT_ULL(shift);
-
- return AMD64_L3_EN_ALL_SLICES | core | thread_mask;
+ return AMD64_L3_EN_ALL_SLICES | AMD64_L3_EN_ALL_CORES |
+ AMD64_L3_F19H_THREAD_MASK;
}
static int amd_uncore_event_init(struct perf_event *event)
@@ -232,7 +220,7 @@ static int amd_uncore_event_init(struct perf_event *event)
* For other events, the two fields do not affect the count.
*/
if (l3_mask && is_llc_event(event))
- hwc->config |= l3_thread_slice_mask(event->cpu);
+ hwc->config |= l3_thread_slice_mask();
uncore = event_to_amd_uncore(event);
if (!uncore)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From c8fe99d0701fec9fb849ec880a86bc5592530496 Mon Sep 17 00:00:00 2001
From: Kim Phillips <kim.phillips(a)amd.com>
Date: Tue, 8 Sep 2020 16:47:34 -0500
Subject: [PATCH] perf/amd/uncore: Set all slices and threads to restore perf
stat -a behaviour
Commit 2f217d58a8a0 ("perf/x86/amd/uncore: Set the thread mask for
F17h L3 PMCs") inadvertently changed the uncore driver's behaviour
wrt perf tool invocations with or without a CPU list, specified with
-C / --cpu=.
Change the behaviour of the driver to assume the former all-cpu (-a)
case, which is the more commonly desired default. This fixes
'-a -A' invocations without explicit cpu lists (-C) to not count
L3 events only on behalf of the first thread of the first core
in the L3 domain.
BEFORE:
Activity performed by the first thread of the last core (CPU#43) in
CPU#40's L3 domain is not reported by CPU#40:
sudo perf stat -a -A -e l3_request_g1.caching_l3_cache_accesses taskset -c 43 perf bench mem memcpy -s 32mb -l 100 -f default
...
CPU36 21,835 l3_request_g1.caching_l3_cache_accesses
CPU40 87,066 l3_request_g1.caching_l3_cache_accesses
CPU44 17,360 l3_request_g1.caching_l3_cache_accesses
...
AFTER:
The L3 domain activity is now reported by CPU#40:
sudo perf stat -a -A -e l3_request_g1.caching_l3_cache_accesses taskset -c 43 perf bench mem memcpy -s 32mb -l 100 -f default
...
CPU36 354,891 l3_request_g1.caching_l3_cache_accesses
CPU40 1,780,870 l3_request_g1.caching_l3_cache_accesses
CPU44 315,062 l3_request_g1.caching_l3_cache_accesses
...
Fixes: 2f217d58a8a0 ("perf/x86/amd/uncore: Set the thread mask for F17h L3 PMCs")
Signed-off-by: Kim Phillips <kim.phillips(a)amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lkml.kernel.org/r/20200908214740.18097-2-kim.phillips@amd.com
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 76400c052b0e..e7e61c8b56bd 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -181,28 +181,16 @@ static void amd_uncore_del(struct perf_event *event, int flags)
}
/*
- * Convert logical CPU number to L3 PMC Config ThreadMask format
+ * Return a full thread and slice mask until per-CPU is
+ * properly supported.
*/
-static u64 l3_thread_slice_mask(int cpu)
+static u64 l3_thread_slice_mask(void)
{
- u64 thread_mask, core = topology_core_id(cpu);
- unsigned int shift, thread = 0;
+ if (boot_cpu_data.x86 <= 0x18)
+ return AMD64_L3_SLICE_MASK | AMD64_L3_THREAD_MASK;
- if (topology_smt_supported() && !topology_is_primary_thread(cpu))
- thread = 1;
-
- if (boot_cpu_data.x86 <= 0x18) {
- shift = AMD64_L3_THREAD_SHIFT + 2 * (core % 4) + thread;
- thread_mask = BIT_ULL(shift);
-
- return AMD64_L3_SLICE_MASK | thread_mask;
- }
-
- core = (core << AMD64_L3_COREID_SHIFT) & AMD64_L3_COREID_MASK;
- shift = AMD64_L3_THREAD_SHIFT + thread;
- thread_mask = BIT_ULL(shift);
-
- return AMD64_L3_EN_ALL_SLICES | core | thread_mask;
+ return AMD64_L3_EN_ALL_SLICES | AMD64_L3_EN_ALL_CORES |
+ AMD64_L3_F19H_THREAD_MASK;
}
static int amd_uncore_event_init(struct perf_event *event)
@@ -232,7 +220,7 @@ static int amd_uncore_event_init(struct perf_event *event)
* For other events, the two fields do not affect the count.
*/
if (l3_mask && is_llc_event(event))
- hwc->config |= l3_thread_slice_mask(event->cpu);
+ hwc->config |= l3_thread_slice_mask();
uncore = event_to_amd_uncore(event);
if (!uncore)