The patch titled
Subject: slub: track number of slabs irrespective of CONFIG_SLUB_DEBUG
has been removed from the -mm tree. Its filename was
slub-track-number-of-slabs-irrespective-of-config_slub_debug.patch
This patch was dropped because it is obsolete
------------------------------------------------------
From: Shakeel Butt <shakeelb(a)google.com>
Subject: slub: track number of slabs irrespective of CONFIG_SLUB_DEBUG
For !CONFIG_SLUB_DEBUG, SLUB does not maintain the number of slabs
allocated per node for a kmem_cache. Thus, slabs_node() in
__kmem_cache_empty(), __kmem_cache_shrink() and __kmem_cache_destroy()
will always return 0 for such a config. This is wrong and can cause
issues for all users of these functions.
In fact, in [1] Jason reported a system crash while using SLUB without
CONFIG_SLUB_DEBUG. The cause was the use of slabs_node() by
__kmem_cache_empty().
The right solution is to make slabs_node() work even for
!CONFIG_SLUB_DEBUG. Commit 0f389ec63077 ("slub: No need for per node
slab counters if !SLUB_DEBUG") put the per-node slab counter under
CONFIG_SLUB_DEBUG because it was only read through the sysfs API, which
is disabled on !CONFIG_SLUB_DEBUG. However, the users of the per-node
slab counter assumed that it would work in the absence of
CONFIG_SLUB_DEBUG. So, make the counter work for !CONFIG_SLUB_DEBUG.
Please note that f9e13c0a5a33 ("slab, slub: skip unnecessary
kasan_cache_shutdown()") exposed this issue, but it was present even
before that commit.
[1] http://lkml.kernel.org/r/CAHmME9rtoPwxUSnktxzKso14iuVCWT7BE_-_8PAC=pGw1iJnQ…
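For reference, here is (approximately) the upstream __kmem_cache_empty()
from mm/slub.c; with the old !CONFIG_SLUB_DEBUG stub of slabs_node()
hardwired to return 0, it reports a cache as empty even while full slabs
still hold live objects:

	bool __kmem_cache_empty(struct kmem_cache *s)
	{
		int node;
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n)
			if (n->nr_partial || slabs_node(s, node))
				return false;
		return true;
	}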
Link: http://lkml.kernel.org/r/20180620224147.23777-1-shakeelb@google.com
Fixes: f9e13c0a5a33 ("slab, slub: skip unnecessary kasan_cache_shutdown()")
Signed-off-by: Shakeel Butt <shakeelb(a)google.com>
Suggested-by: David Rientjes <rientjes(a)google.com>
Reported-by: Jason A. Donenfeld <Jason(a)zx2c4.com>
Cc: Christoph Lameter <cl(a)linux.com>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/slab.h | 2 -
mm/slub.c | 80 ++++++++++++++++++++++++----------------------------
2 files changed, 38 insertions(+), 44 deletions(-)
diff -puN mm/slab.h~slub-track-number-of-slabs-irrespective-of-config_slub_debug mm/slab.h
--- a/mm/slab.h~slub-track-number-of-slabs-irrespective-of-config_slub_debug
+++ a/mm/slab.h
@@ -473,8 +473,8 @@ struct kmem_cache_node {
#ifdef CONFIG_SLUB
unsigned long nr_partial;
struct list_head partial;
-#ifdef CONFIG_SLUB_DEBUG
atomic_long_t nr_slabs;
+#ifdef CONFIG_SLUB_DEBUG
atomic_long_t total_objects;
struct list_head full;
#endif
diff -puN mm/slub.c~slub-track-number-of-slabs-irrespective-of-config_slub_debug mm/slub.c
--- a/mm/slub.c~slub-track-number-of-slabs-irrespective-of-config_slub_debug
+++ a/mm/slub.c
@@ -1030,42 +1030,6 @@ static void remove_full(struct kmem_cach
list_del(&page->lru);
}
-/* Tracking of the number of slabs for debugging purposes */
-static inline unsigned long slabs_node(struct kmem_cache *s, int node)
-{
- struct kmem_cache_node *n = get_node(s, node);
-
- return atomic_long_read(&n->nr_slabs);
-}
-
-static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
-{
- return atomic_long_read(&n->nr_slabs);
-}
-
-static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
-{
- struct kmem_cache_node *n = get_node(s, node);
-
- /*
- * May be called early in order to allocate a slab for the
- * kmem_cache_node structure. Solve the chicken-egg
- * dilemma by deferring the increment of the count during
- * bootstrap (see early_kmem_cache_node_alloc).
- */
- if (likely(n)) {
- atomic_long_inc(&n->nr_slabs);
- atomic_long_add(objects, &n->total_objects);
- }
-}
-static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
-{
- struct kmem_cache_node *n = get_node(s, node);
-
- atomic_long_dec(&n->nr_slabs);
- atomic_long_sub(objects, &n->total_objects);
-}
-
/* Object debug checks for alloc/free paths */
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
@@ -1321,16 +1285,46 @@ slab_flags_t kmem_cache_flags(unsigned i
#define disable_higher_order_debug 0
+#endif /* CONFIG_SLUB_DEBUG */
+
static inline unsigned long slabs_node(struct kmem_cache *s, int node)
- { return 0; }
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ return atomic_long_read(&n->nr_slabs);
+}
+
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
- { return 0; }
-static inline void inc_slabs_node(struct kmem_cache *s, int node,
- int objects) {}
-static inline void dec_slabs_node(struct kmem_cache *s, int node,
- int objects) {}
+{
+ return atomic_long_read(&n->nr_slabs);
+}
-#endif /* CONFIG_SLUB_DEBUG */
+static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ /*
+ * May be called early in order to allocate a slab for the
+ * kmem_cache_node structure. Solve the chicken-egg
+ * dilemma by deferring the increment of the count during
+ * bootstrap (see early_kmem_cache_node_alloc).
+ */
+ if (likely(n)) {
+ atomic_long_inc(&n->nr_slabs);
+#ifdef CONFIG_SLUB_DEBUG
+ atomic_long_add(objects, &n->total_objects);
+#endif
+ }
+}
+static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+ struct kmem_cache_node *n = get_node(s, node);
+
+ atomic_long_dec(&n->nr_slabs);
+#ifdef CONFIG_SLUB_DEBUG
+ atomic_long_sub(objects, &n->total_objects);
+#endif
+}
/*
* Hooks for other subsystems that check memory allocations. In a typical
_
Patches currently in -mm which might be from shakeelb(a)google.com are
kvm-mm-account-shadow-page-tables-to-kmemcg.patch
fs-fsnotify-account-fsnotify-metadata-to-kmemcg.patch
fs-fsnotify-account-fsnotify-metadata-to-kmemcg-fix.patch
fs-mm-account-buffer_head-to-kmemcg.patch
fs-mm-account-buffer_head-to-kmemcgpatchfix.patch
memcg-reduce-memcg-tree-traversals-for-stats-collection.patch
From: Snild Dolkow <snild(a)sony.com>
There is a window for racing when printing directly to task->comm,
allowing other threads to see a non-terminated string. The vsnprintf
function fills the buffer, counts the truncated chars, then finally
writes the \0 at the end.
            creator                     other
  vsnprintf:
    fill (not terminated)
    count the rest          trace_sched_waking(p):
    ...                       memcpy(comm, p->comm, TASK_COMM_LEN)
    write \0
The consequences depend on how 'other' uses the string. In our case,
it was copied into the tracing system's saved cmdlines, a buffer of
adjacent TASK_COMM_LEN-byte buffers (note the 'n' where 0 should be):
crash-arm64> x/1024s savedcmd->saved_cmdlines | grep 'evenk'
0xffffffd5b3818640: "irq/497-pwr_evenkworker/u16:12"
...and a strcpy out of there would cause stack corruption:
[224761.522292] Kernel panic - not syncing: stack-protector:
Kernel stack is corrupted in: ffffff9bf9783c78
crash-arm64> kbt | grep 'comm\|trace_print_context'
#6 0xffffff9bf9783c78 in trace_print_context+0x18c(+396)
comm (char [16]) = "irq/497-pwr_even"
crash-arm64> rd 0xffffffd4d0e17d14 8
ffffffd4d0e17d14: 2f71726900000000 5f7277702d373934 ....irq/497-pwr_
ffffffd4d0e17d24: 726f776b6e657665 3a3631752f72656b evenkworker/u16:
ffffffd4d0e17d34: f9780248ff003231 cede60e0ffffff9b 12..H.x......`..
ffffffd4d0e17d44: cede60c8ffffffd4 00000fffffffffd4 .....`..........
The workaround in e09e28671 (use strlcpy in __trace_find_cmdline) was
likely needed because of this same bug.
Solve this by vsnprintf()ing into a local buffer, then using
set_task_comm(). This way, there is never a window where comm is not
terminated.
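For reference, set_task_comm() ends up in __set_task_comm(), which is
approximately (fs/exec.c):

	void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
	{
		task_lock(tsk);
		trace_task_rename(tsk, buf);
		strlcpy(tsk->comm, buf, sizeof(tsk->comm));
		task_unlock(tsk);
		perf_event_comm(tsk, exec);
	}

The copy into comm is now a short, immediately-terminated strlcpy() from
an already-complete local string, instead of leaving comm unterminated
while vsnprintf() counts the rest of the format.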
Link: http://lkml.kernel.org/r/20180726071539.188015-1-snild@sony.com
Cc: stable(a)vger.kernel.org
Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure")
Reviewed-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
Signed-off-by: Snild Dolkow <snild(a)sony.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/kthread.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 750cb8082694..486dedbd9af5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,8 +325,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
task = create->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
+ char name[TASK_COMM_LEN];
- vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+ /*
+ * task is already visible to other tasks, so updating
+ * COMM must be protected.
+ */
+ vsnprintf(name, sizeof(name), namefmt, args);
+ set_task_comm(task, name);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
--
2.17.1
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
Commit 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on
enable_trace_kprobe() failure") added an if statement that depends on
another if statement which gcc cannot see will initialize the "link"
variable, so it gives the warning:
"warning: 'link' may be used uninitialized in this function"
It is really a false positive, but to quiet the warning, and also to make
sure that it never actually is used uninitialized, initialize the "link"
variable to NULL and add an if (!WARN_ON_ONCE(!link)) where the compiler
thinks it could be used uninitialized.
Cc: stable(a)vger.kernel.org
Fixes: 57ea2a34adf4 ("tracing/kprobes: Fix trace_probe flags on enable_trace_kprobe() failure")
Reported-by: kbuild test robot <lkp(a)intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/trace/trace_kprobe.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27ace4513c43..6b71860f3998 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -400,7 +400,7 @@ static struct trace_kprobe *find_trace_kprobe(const char *event,
static int
enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
{
- struct event_file_link *link;
+ struct event_file_link *link = NULL;
int ret = 0;
if (file) {
@@ -426,7 +426,9 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct trace_event_file *file)
if (ret) {
if (file) {
- list_del_rcu(&link->list);
+ /* Notice the if is true on not WARN() */
+ if (!WARN_ON_ONCE(!link))
+ list_del_rcu(&link->list);
kfree(link);
tk->tp.flags &= ~TP_FLAG_TRACE;
} else {
--
2.17.1
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
There was a case that triggered a double free in event_trigger_callback()
due to the called reg() function freeing the trigger_data and then it
getting freed again by the error return by the caller. The solution there
was to up the trigger_data ref count.
Code inspection found that event_enable_trigger_func() has the same issue,
but is not as easy to trigger (requires harder to trigger failures). It
needs to be solved slightly different as it needs more to clean up when the
reg() function fails.
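For context, the ref counting relies on the existing trigger helpers,
approximately (kernel/trace/trace_events_trigger.c):

	int event_trigger_init(struct event_trigger_ops *ops,
			       struct event_trigger_data *data)
	{
		data->ref++;
		return 0;
	}

	void event_trigger_free(struct event_trigger_ops *ops,
				struct event_trigger_data *data)
	{
		if (WARN_ON_ONCE(data->ref <= 0))
			return;

		data->ref--;
		if (!data->ref)
			trigger_data_free(data);
	}

Taking a reference before calling reg() means a failing reg() that frees
the trigger internally only drops its own reference; the final
event_trigger_free() on the error path then performs the single real free.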
Link: http://lkml.kernel.org/r/20180725124008.7008e586@gandalf.local.home
Cc: stable(a)vger.kernel.org
Fixes: 7862ad1846e99 ("tracing: Add 'enable_event' and 'disable_event' event trigger commands")
Reviewed-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
kernel/trace/trace_events_trigger.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d18ec0e58be2..5dea177cef53 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1420,6 +1420,9 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
goto out;
}
+ /* Up the trigger_data count to make sure nothing frees it on failure */
+ event_trigger_init(trigger_ops, trigger_data);
+
if (trigger) {
number = strsep(&trigger, ":");
@@ -1470,6 +1473,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
goto out_disable;
/* Just return zero, not the number of enabled functions */
ret = 0;
+ event_trigger_free(trigger_ops, trigger_data);
out:
return ret;
@@ -1480,7 +1484,7 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
out_free:
if (cmd_ops->set_filter)
cmd_ops->set_filter(NULL, trigger_data, NULL);
- kfree(trigger_data);
+ event_trigger_free(trigger_ops, trigger_data);
kfree(enable_data);
goto out;
}
--
2.17.1
From: Masami Hiramatsu <mhiramat(a)kernel.org>
Maintain the tracing on/off setting of the ring_buffer when switching
to the trace buffer snapshot.
Taking a snapshot is done by swapping the backup ring buffer
(max_tr_buffer). But since the tracing on/off setting is defined
by the ring buffer, when swapping it, the tracing on/off setting
can also be changed. This causes a strange result like below:
/sys/kernel/debug/tracing # cat tracing_on
1
/sys/kernel/debug/tracing # echo 0 > tracing_on
/sys/kernel/debug/tracing # cat tracing_on
0
/sys/kernel/debug/tracing # echo 1 > snapshot
/sys/kernel/debug/tracing # cat tracing_on
1
/sys/kernel/debug/tracing # echo 1 > snapshot
/sys/kernel/debug/tracing # cat tracing_on
0
We don't touch tracing_on, but the snapshot changes the tracing_on
setting each time. This is an anomaly, because the user doesn't know
that each ring_buffer stores its own tracing-enable state and that the
snapshot is done by swapping ring buffers.
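With this patch applied, the expected behavior is that the tracing_on
setting survives snapshot swaps:

/sys/kernel/debug/tracing # echo 0 > tracing_on
/sys/kernel/debug/tracing # echo 1 > snapshot
/sys/kernel/debug/tracing # cat tracing_on
0
/sys/kernel/debug/tracing # echo 1 > snapshot
/sys/kernel/debug/tracing # cat tracing_on
0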
Link: http://lkml.kernel.org/r/153149929558.11274.11730609978254724394.stgit@devb…
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Tom Zanussi <tom.zanussi(a)linux.intel.com>
Cc: Hiraku Toyooka <hiraku.toyooka(a)cybertrust.co.jp>
Cc: stable(a)vger.kernel.org
Fixes: debdd57f5145 ("tracing: Make a snapshot feature available from userspace")
Signed-off-by: Masami Hiramatsu <mhiramat(a)kernel.org>
[ Updated commit log and comment in the code ]
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
include/linux/ring_buffer.h | 1 +
kernel/trace/ring_buffer.c | 16 ++++++++++++++++
kernel/trace/trace.c | 6 ++++++
3 files changed, 23 insertions(+)
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b72ebdff0b77..003d09ab308d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -165,6 +165,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer);
void ring_buffer_record_off(struct ring_buffer *buffer);
void ring_buffer_record_on(struct ring_buffer *buffer);
int ring_buffer_record_is_on(struct ring_buffer *buffer);
+int ring_buffer_record_is_set_on(struct ring_buffer *buffer);
void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6a46af21765c..0b0b688ea166 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3226,6 +3226,22 @@ int ring_buffer_record_is_on(struct ring_buffer *buffer)
return !atomic_read(&buffer->record_disabled);
}
+/**
+ * ring_buffer_record_is_set_on - return true if the ring buffer is set writable
+ * @buffer: The ring buffer to see if write is set enabled
+ *
+ * Returns true if the ring buffer is set writable by ring_buffer_record_on().
+ * Note that this does NOT mean it is in a writable state.
+ *
+ * It may return true when the ring buffer has been disabled by
+ * ring_buffer_record_disable(), as that is a temporary disabling of
+ * the ring buffer.
+ */
+int ring_buffer_record_is_set_on(struct ring_buffer *buffer)
+{
+ return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
+}
+
/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 87cf25171fb8..823687997b01 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1373,6 +1373,12 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
arch_spin_lock(&tr->max_lock);
+ /* Inherit the recordable setting from trace_buffer */
+ if (ring_buffer_record_is_set_on(tr->trace_buffer.buffer))
+ ring_buffer_record_on(tr->max_buffer.buffer);
+ else
+ ring_buffer_record_off(tr->max_buffer.buffer);
+
swap(tr->trace_buffer.buffer, tr->max_buffer.buffer);
__update_max_tr(tr, tsk, cpu);
--
2.17.1
The patch below does not apply to the 4.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From eb493fbc150f4a28151ae1ee84f24395989f3600 Mon Sep 17 00:00:00 2001
From: Lyude Paul <lyude(a)redhat.com>
Date: Tue, 3 Jul 2018 16:31:41 -0400
Subject: [PATCH] drm/nouveau: Set DRIVER_ATOMIC cap earlier to fix debugfs
Currently nouveau doesn't actually expose the state debugfs file that's
usually provided for any modesetting driver that supports atomic, even
if nouveau is loaded with atomic=1. This is due to the fact that the
standard debugfs files that DRM creates for atomic drivers are set up
when drm_get_pci_dev() is called from nouveau_drm.c. This happens well
before we've initialized the display core, which is currently
responsible for setting the DRIVER_ATOMIC cap.
So, move the atomic option into nouveau_drm.c and just add the
DRIVER_ATOMIC cap whenever it's enabled on the kernel commandline. This
shouldn't cause any actual issues, as the atomic ioctl will still fail
as expected even if the display core doesn't disable it until later in
the init sequence. This also provides the added benefit of being able to
use the state debugfs file to check the current display state even if
clients aren't allowed to modify it through anything other than the
legacy ioctls.
Additionally, disable the DRIVER_ATOMIC cap in nv04's display core, as
this was already disabled there previously.
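As a usage note (device paths assumed, not part of the patch): with the
module loaded as 'nouveau atomic=1', the DRM core's atomic state file
should now be present and readable even before any atomic client runs,
e.g.:

	# modprobe nouveau atomic=1
	# cat /sys/kernel/debug/dri/0/state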
Signed-off-by: Lyude Paul <lyude(a)redhat.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Ben Skeggs <bskeggs(a)redhat.com>
diff --git a/drivers/gpu/drm/nouveau/dispnv04/disp.c b/drivers/gpu/drm/nouveau/dispnv04/disp.c
index 501d2d290e9c..70dce544984e 100644
--- a/drivers/gpu/drm/nouveau/dispnv04/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv04/disp.c
@@ -55,6 +55,9 @@ nv04_display_create(struct drm_device *dev)
nouveau_display(dev)->init = nv04_display_init;
nouveau_display(dev)->fini = nv04_display_fini;
+ /* Pre-nv50 doesn't support atomic, so don't expose the ioctls */
+ dev->driver->driver_features &= ~DRIVER_ATOMIC;
+
nouveau_hw_save_vga_fonts(dev, 1);
nv04_crtc_create(dev, 0);
diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c
index 31b12b4f321a..9bae4db84cfb 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c
@@ -2126,10 +2126,6 @@ nv50_display_destroy(struct drm_device *dev)
kfree(disp);
}
-MODULE_PARM_DESC(atomic, "Expose atomic ioctl (default: disabled)");
-static int nouveau_atomic = 0;
-module_param_named(atomic, nouveau_atomic, int, 0400);
-
int
nv50_display_create(struct drm_device *dev)
{
@@ -2154,8 +2150,6 @@ nv50_display_create(struct drm_device *dev)
disp->disp = &nouveau_display(dev)->disp;
dev->mode_config.funcs = &nv50_disp_func;
dev->driver->driver_features |= DRIVER_PREFER_XBGR_30BPP;
- if (nouveau_atomic)
- dev->driver->driver_features |= DRIVER_ATOMIC;
/* small shared memory area we use for notifiers and semaphores */
ret = nouveau_bo_new(&drm->client, 4096, 0x1000, TTM_PL_FLAG_VRAM,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 514903338782..f5d3158f0378 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -81,6 +81,10 @@ MODULE_PARM_DESC(modeset, "enable driver (default: auto, "
int nouveau_modeset = -1;
module_param_named(modeset, nouveau_modeset, int, 0400);
+MODULE_PARM_DESC(atomic, "Expose atomic ioctl (default: disabled)");
+static int nouveau_atomic = 0;
+module_param_named(atomic, nouveau_atomic, int, 0400);
+
MODULE_PARM_DESC(runpm, "disable (0), force enable (1), optimus only default (-1)");
static int nouveau_runtime_pm = -1;
module_param_named(runpm, nouveau_runtime_pm, int, 0400);
@@ -509,6 +513,9 @@ static int nouveau_drm_probe(struct pci_dev *pdev,
pci_set_master(pdev);
+ if (nouveau_atomic)
+ driver_pci.driver_features |= DRIVER_ATOMIC;
+
ret = drm_get_pci_dev(pdev, pent, &driver_pci);
if (ret) {
nvkm_device_del(&device);
Hi Stable Team,
On 13/06/2018 14:20, Neil Armstrong wrote:
> On Amlogic Meson GXBB & GXL platforms, the SCPI Cortex-M4 Co-Processor
> seems to be dependent on the FCLK_DIV2 to be operational.
>
> The issue has occurred since v4.17-rc1, freezing the kernel boot when
> the 'schedutil' cpufreq governor was selected as default:
>
> [ 12.071837] scpi_protocol scpi: SCP Protocol 0.0 Firmware 0.0.0 version
> domain-0 init dvfs: 4
> [ 12.087757] hctosys: unable to open rtc device (rtc0)
> [ 12.087907] cfg80211: Loading compiled-in X.509 certificates for regulatory database
> [ 12.102241] cfg80211: Loaded X.509 cert 'sforshee: 00b28ddf47aef9cea7'
>
> But when disabling the MMC driver, the boot finished but cpufreq failed to
> change the CPU frequency:
>
> [ 12.153045] cpufreq: __target_index: Failed to change cpu frequency: -5
>
> A bisect between v4.16 and v4.17-rc1 identified commit 05f814402d61 as
> the first bad commit.
> This commit added support for the missing clock gates before the fixed PLL
> fixed dividers (FCLK_DIVx) and the clock framework basically disabled
> all the unused fixed dividers, thus disabled a critical clock path for
> the SCPI Co-Processor.
>
> This patch simply sets the FCLK_DIV2 gate as critical to ensure
> nobody can disable it.
>
> Fixes: 05f814402d61 ("clk: meson: add fdiv clock gates")
> Signed-off-by: Neil Armstrong <narmstrong(a)baylibre.com>
This patch hit linux master with commit id c987ac6f1f088663b6dad39281071aeb31d450a8
Could this be backported to the next 4.17 stable release?
Thanks,
Neil
> ---
> drivers/clk/meson/gxbb.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/drivers/clk/meson/gxbb.c b/drivers/clk/meson/gxbb.c
> index b1e4d95..0e053c1 100644
> --- a/drivers/clk/meson/gxbb.c
> +++ b/drivers/clk/meson/gxbb.c
> @@ -511,6 +511,7 @@ static struct clk_regmap gxbb_fclk_div2 = {
> .ops = &clk_regmap_gate_ops,
> .parent_names = (const char *[]){ "fclk_div2_div" },
> .num_parents = 1,
> + .flags = CLK_IS_CRITICAL,
> },
> };
>
>
This is the start of the stable review cycle for the 4.4.139 release.
There are 105 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Tue Jul 3 15:31:30 UTC 2018.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.139-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.4.139-rc1
Szymon Janc <szymon.janc(a)codecoup.pl>
Bluetooth: Fix connection if directed advertising and privacy is used
Bjørn Mork <bjorn(a)mork.no>
cdc_ncm: avoid padding beyond end of skb
Mike Snitzer <snitzer(a)redhat.com>
dm thin: handle running out of data space vs concurrent discard
Keith Busch <keith.busch(a)intel.com>
block: Fix transfer when chunk sectors exceeds max
Maxime Chevallier <maxime.chevallier(a)bootlin.com>
spi: Fix scatterlist elements size in spi_map_buf
Liu Bo <bo.li.liu(a)oracle.com>
Btrfs: fix unexpected cow in run_delalloc_nocow
Takashi Iwai <tiwai(a)suse.de>
ALSA: hda/realtek - Add a quirk for FSC ESPRIMO U9210
KT Liao <kt.liao(a)emc.com.tw>
Input: elantech - fix V4 report decoding for module with middle key
Aaron Ma <aaron.ma(a)canonical.com>
Input: elantech - enable middle button of touchpads on ThinkPad P52
Ben Hutchings <ben.hutchings(a)codethink.co.uk>
Input: elan_i2c_smbus - fix more potential stack buffer overflows
Jan Kara <jack(a)suse.cz>
udf: Detect incorrect directory size
Boris Ostrovsky <boris.ostrovsky(a)oracle.com>
xen: Remove unnecessary BUG_ON from __unbind_from_irq()
Alexandr Savca <alexandr.savca(a)saltedge.com>
Input: elan_i2c - add ELAN0618 (Lenovo v330 15IKB) ACPI ID
Kees Cook <keescook(a)chromium.org>
video: uvesafb: Fix integer overflow in allocation
Dave Wysochanski <dwysocha(a)redhat.com>
NFSv4: Fix possible 1-byte stack overflow in nfs_idmap_read_and_verify_message
Scott Mayhew <smayhew(a)redhat.com>
nfsd: restrict rd_maxcount to svc_max_payload in nfsd_encode_readdir
Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
media: dvb_frontend: fix locking issues at dvb_frontend_get_event()
Kai-Heng Feng <kai.heng.feng(a)canonical.com>
media: cx231xx: Add support for AverMedia DVD EZMaker 7
Mauro Carvalho Chehab <mchehab(a)s-opensource.com>
media: v4l2-compat-ioctl32: prevent go past max size
Adrian Hunter <adrian.hunter(a)intel.com>
perf intel-pt: Fix packet decoding of CYC packets
Adrian Hunter <adrian.hunter(a)intel.com>
perf intel-pt: Fix "Unexpected indirect branch" error
Adrian Hunter <adrian.hunter(a)intel.com>
perf intel-pt: Fix MTC timing after overflow
Adrian Hunter <adrian.hunter(a)intel.com>
perf intel-pt: Fix decoding to accept CBR between FUP and corresponding TIP
Adrian Hunter <adrian.hunter(a)intel.com>
perf intel-pt: Fix sync_switch INTEL_PT_SS_NOT_TRACING
Adrian Hunter <adrian.hunter(a)intel.com>
perf tools: Fix symbol and object code resolution for vdso32 and vdsox32
Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
mfd: intel-lpss: Program REMAP register in PIO mode
Johan Hovold <johan(a)kernel.org>
backlight: tps65217_bl: Fix Device Tree node lookup
Johan Hovold <johan(a)kernel.org>
backlight: max8925_bl: Fix Device Tree node lookup
Johan Hovold <johan(a)kernel.org>
backlight: as3711_bl: Fix Device Tree node lookup
Florian Westphal <fw(a)strlen.de>
xfrm: skip policies marked as dead while rehashing
Tobias Brunner <tobias(a)strongswan.org>
xfrm: Ignore socket policies when rebuilding hash tables
Silvio Cesare <silvio.cesare(a)gmail.com>
UBIFS: Fix potential integer overflow in allocation
Richard Weinberger <richard(a)nod.at>
ubi: fastmap: Cancel work upon detach
NeilBrown <neilb(a)suse.com>
md: fix two problems with setting the "re-add" device state.
Robert Elliott <elliott(a)hpe.com>
libnvdimm, pmem: Preserve read-only setting for pmem devices
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: fix missing REC trigger trace on enqueue without ERP thread
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: fix missing REC trigger trace for all objects in ERP_FAILED
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: fix missing REC trigger trace on terminate_rport_io for ERP_FAILED
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: fix missing REC trigger trace on terminate_rport_io early return
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: fix misleading REC trigger trace where erp_action setup failed
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: fix missing SCSI trace for retry of abort / scsi_eh TMF
Steffen Maier <maier(a)linux.ibm.com>
scsi: zfcp: fix missing SCSI trace for result of eh_host_reset_handler
Himanshu Madhani <himanshu.madhani(a)cavium.com>
scsi: qla2xxx: Fix setting lower transfer speed if GPSC fails
Martin Kelly <mkelly(a)xevo.com>
iio:buffer: make length types match kfifo types
Omar Sandoval <osandov(a)fb.com>
Btrfs: fix clone vs chattr NODATASUM race
Geert Uytterhoeven <geert(a)linux-m68k.org>
time: Make sure jiffies_to_msecs() preserves non-zero time periods
Huacai Chen <chenhc(a)lemote.com>
MIPS: io: Add barrier after register read in inX()
Mika Westerberg <mika.westerberg(a)linux.intel.com>
PCI: pciehp: Clear Presence Detect and Data Link Layer Status Changed on resume
Tokunori Ikegami <ikegami(a)allied-telesis.co.jp>
MIPS: BCM47XX: Enable 74K Core ExternalSync for PCIe erratum
Joakim Tjernlund <joakim.tjernlund(a)infinera.com>
mtd: cfi_cmdset_0002: Avoid walking all chips when unlocking.
Joakim Tjernlund <joakim.tjernlund(a)infinera.com>
mtd: cfi_cmdset_0002: Fix unlocking requests crossing a chip boudary
Joakim Tjernlund <joakim.tjernlund(a)infinera.com>
mtd: cfi_cmdset_0002: fix SEGV unlocking multiple chips
Joakim Tjernlund <joakim.tjernlund(a)infinera.com>
mtd: cfi_cmdset_0002: Use right chip in do_ppb_xxlock()
Tokunori Ikegami <ikegami(a)allied-telesis.co.jp>
mtd: cfi_cmdset_0002: Change write buffer to check correct value
Leon Romanovsky <leonro(a)mellanox.com>
RDMA/mlx4: Discard unknown SQP work requests
Mike Marciniszyn <mike.marciniszyn(a)intel.com>
IB/qib: Fix DMA api warning with debug kernel
Stefan M Schaeckeler <sschaeck(a)cisco.com>
of: unittest: for strings, account for trailing \0 in property length field
David Rivshin <DRivshin(a)allworx.com>
ARM: 8764/1: kgdb: fix NUMREGBYTES so that gdb_regs[] is the correct size
Mahesh Salgaonkar <mahesh(a)linux.vnet.ibm.com>
powerpc/fadump: Unregister fadump on kexec down path.
Gautham R. Shenoy <ego(a)linux.vnet.ibm.com>
cpuidle: powernv: Fix promotion from snooze if next state disabled
Michael Neuling <mikey(a)neuling.org>
powerpc/ptrace: Fix enforcement of DAWR constraints
Michael Neuling <mikey(a)neuling.org>
powerpc/ptrace: Fix setting 512B aligned breakpoints with PTRACE_SET_DEBUGREG
Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
powerpc/mm/hash: Add missing isync prior to kernel stack SLB switch
Miklos Szeredi <mszeredi(a)redhat.com>
fuse: fix control dir setup and teardown
Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
fuse: don't keep dead fuse_conn at fuse_fill_super().
Miklos Szeredi <mszeredi(a)redhat.com>
fuse: atomic_o_trunc should truncate pagecache
Amit Pundir <amit.pundir(a)linaro.org>
Bluetooth: hci_qca: Avoid missing rampatch failure with userspace fw loader
Corey Minyard <cminyard(a)mvista.com>
ipmi:bt: Set the timeout before doing a capabilities check
Mikulas Patocka <mpatocka(a)redhat.com>
branch-check: fix long->int truncation when profiling branches
Matthias Schiffer <mschiffer(a)universe-factory.net>
mips: ftrace: fix static function graph tracing
Geert Uytterhoeven <geert+renesas(a)glider.be>
lib/vsprintf: Remove atomic-unsafe support for %pCr
Alexander Sverdlin <alexander.sverdlin(a)gmail.com>
ASoC: cirrus: i2s: Fix {TX|RX}LinCtrlData setup
Alexander Sverdlin <alexander.sverdlin(a)gmail.com>
ASoC: cirrus: i2s: Fix LRCLK configuration
Srinivas Kandagatla <srinivas.kandagatla(a)linaro.org>
ASoC: dapm: delete dapm_kcontrol_data paths list before freeing it
Ingo Flaschberger <ingo.flaschberger(a)gmail.com>
1wire: family module autoload fails because of upper/lower case mismatch.
Maxim Moseychuk <franchesko.salias.hudro.pedros(a)gmail.com>
usb: do not reset if a low-speed or full-speed device timed out
Eric W. Biederman <ebiederm(a)xmission.com>
signal/xtensa: Consistenly use SIGBUS in do_unaligned_user
Daniel Wagner <daniel.wagner(a)siemens.com>
serial: sh-sci: Use spin_{try}lock_irqsave instead of open coding version
Michael Schmitz <schmitzmic(a)gmail.com>
m68k/mm: Adjust VM area to be unmapped by gap size for __iounmap()
Dan Williams <dan.j.williams(a)intel.com>
x86/spectre_v1: Disable compiler optimizations over array_index_mask_nospec()
Thadeu Lima de Souza Cascardo <cascardo(a)canonical.com>
fs/binfmt_misc.c: do not allow offset overflow
Stefan Potyra <Stefan.Potyra(a)elektrobit.com>
w1: mxc_w1: Enable clock before calling clk_get_rate() on it
Hans de Goede <hdegoede(a)redhat.com>
libata: Drop SanDisk SD7UB3Q*G1001 NOLPM quirk
Dan Carpenter <dan.carpenter(a)oracle.com>
libata: zpodd: small read overflow in eject_tray()
Colin Ian King <colin.king(a)canonical.com>
libata: zpodd: make arrays cdb static, reduces object code size
Tao Wang <kevin.wangtao(a)hisilicon.com>
cpufreq: Fix new policy initialization during limits updates via sysfs
Dennis Wassenberg <dennis.wassenberg(a)secunet.com>
ALSA: hda: add dock and led support for HP ProBook 640 G4
Dennis Wassenberg <dennis.wassenberg(a)secunet.com>
ALSA: hda: add dock and led support for HP EliteBook 830 G5
Bo Chen <chenbo(a)pdx.edu>
ALSA: hda - Handle kzalloc() failure in snd_hda_attach_pcm_stream()
Qu Wenruo <wqu(a)suse.com>
btrfs: scrub: Don't use inode pages for device replace
Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
driver core: Don't ignore class_dir_create_and_add() failure.
Jan Kara <jack(a)suse.cz>
ext4: fix fencepost error in check for inode count overflow during resize
Lukas Czerner <lczerner(a)redhat.com>
ext4: update mtime in ext4_punch_hole even if no blocks are released
Frank van der Linden <fllinden(a)amazon.com>
tcp: verify the checksum of the first data segment in a new connection
Xiangning Yu <yuxiangning(a)gmail.com>
bonding: re-evaluate force_primary when the primary slave name changes
Daniel Glöckner <dg(a)emlix.com>
usb: musb: fix remote wakeup racing with suspend
Liu Bo <bo.li.liu(a)oracle.com>
Btrfs: make raid6 rebuild retry more
Eric Dumazet <edumazet(a)google.com>
tcp: do not overshoot window_clamp in tcp_rcv_space_adjust()
Sasha Levin <Alexander.Levin(a)microsoft.com>
Revert "Btrfs: fix scrub to repair raid6 corruption"
Finn Thain <fthain(a)telegraphics.com.au>
net/sonic: Use dma_mapping_error()
Josh Hill <josh(a)joshuajhill.com>
net: qmi_wwan: Add Netgear Aircard 779S
Ivan Bornyakov <brnkv.i1(a)gmail.com>
atm: zatm: fix memcmp casting
Julian Anastasov <ja(a)ssi.bg>
ipvs: fix buffer overflow with sync daemon and service
Paolo Abeni <pabeni(a)redhat.com>
netfilter: ebtables: handle string from userspace with care
Eric Dumazet <edumazet(a)google.com>
xfrm6: avoid potential infinite loop in _decode_session6()
-------------
Diffstat:
Documentation/printk-formats.txt | 3 +-
Makefile | 4 +-
arch/arm/include/asm/kgdb.h | 2 +-
arch/m68k/mm/kmap.c | 3 +-
arch/mips/bcm47xx/setup.c | 6 +
arch/mips/include/asm/io.h | 2 +
arch/mips/include/asm/mipsregs.h | 3 +
arch/mips/kernel/mcount.S | 27 ++---
arch/powerpc/kernel/entry_64.S | 1 +
arch/powerpc/kernel/fadump.c | 3 +
arch/powerpc/kernel/hw_breakpoint.c | 4 +-
arch/powerpc/kernel/ptrace.c | 1 +
arch/x86/include/asm/barrier.h | 2 +-
arch/xtensa/kernel/traps.c | 2 +-
drivers/ata/libata-core.c | 3 -
drivers/ata/libata-zpodd.c | 4 +-
drivers/atm/zatm.c | 4 +-
drivers/base/core.c | 14 ++-
drivers/bluetooth/hci_qca.c | 6 +
drivers/char/ipmi/ipmi_bt_sm.c | 3 +-
drivers/cpufreq/cpufreq.c | 2 +
drivers/cpuidle/cpuidle-powernv.c | 32 +++++-
drivers/iio/buffer/kfifo_buf.c | 4 +-
drivers/infiniband/hw/mlx4/mad.c | 1 -
drivers/infiniband/hw/qib/qib.h | 3 +-
drivers/infiniband/hw/qib/qib_file_ops.c | 10 +-
drivers/infiniband/hw/qib/qib_user_pages.c | 20 ++--
drivers/input/mouse/elan_i2c.h | 2 +
drivers/input/mouse/elan_i2c_core.c | 3 +-
drivers/input/mouse/elan_i2c_smbus.c | 10 +-
drivers/input/mouse/elantech.c | 11 +-
drivers/md/dm-thin.c | 11 +-
drivers/md/md.c | 4 +-
drivers/media/dvb-core/dvb_frontend.c | 23 ++--
drivers/media/usb/cx231xx/cx231xx-cards.c | 3 +
drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 2 +-
drivers/mfd/intel-lpss.c | 4 +-
drivers/mtd/chips/cfi_cmdset_0002.c | 21 ++--
drivers/mtd/ubi/build.c | 3 +
drivers/mtd/ubi/wl.c | 4 +-
drivers/net/bonding/bond_options.c | 1 +
drivers/net/ethernet/natsemi/sonic.c | 2 +-
drivers/net/usb/cdc_ncm.c | 4 +-
drivers/net/usb/qmi_wwan.c | 1 +
drivers/nvdimm/bus.c | 14 ++-
drivers/of/unittest.c | 8 +-
drivers/pci/hotplug/pciehp.h | 2 +-
drivers/pci/hotplug/pciehp_core.c | 2 +-
drivers/pci/hotplug/pciehp_hpc.c | 13 ++-
drivers/s390/scsi/zfcp_dbf.c | 40 +++++++
drivers/s390/scsi/zfcp_erp.c | 123 ++++++++++++++++-----
drivers/s390/scsi/zfcp_ext.h | 5 +
drivers/s390/scsi/zfcp_scsi.c | 18 ++-
drivers/scsi/qla2xxx/qla_init.c | 3 +-
drivers/spi/spi.c | 10 +-
drivers/tty/serial/sh-sci.c | 8 +-
drivers/usb/core/hub.c | 4 +-
drivers/usb/musb/musb_host.c | 5 +-
drivers/usb/musb/musb_host.h | 7 +-
drivers/usb/musb/musb_virthub.c | 25 +++--
drivers/video/backlight/as3711_bl.c | 33 ++++--
drivers/video/backlight/max8925_bl.c | 4 +-
drivers/video/backlight/tps65217_bl.c | 4 +-
drivers/video/fbdev/uvesafb.c | 3 +-
drivers/w1/masters/mxc_w1.c | 20 ++--
drivers/w1/w1.c | 2 +-
drivers/xen/events/events_base.c | 2 -
fs/binfmt_misc.c | 12 +-
fs/btrfs/inode.c | 33 +++++-
fs/btrfs/ioctl.c | 12 +-
fs/btrfs/scrub.c | 2 +-
fs/ext4/inode.c | 36 +++---
fs/ext4/resize.c | 2 +-
fs/fuse/control.c | 13 ++-
fs/fuse/dir.c | 13 ++-
fs/fuse/inode.c | 1 +
fs/nfs/nfs4idmap.c | 5 +-
fs/nfsd/nfs4xdr.c | 5 +-
fs/ubifs/journal.c | 2 +-
fs/udf/directory.c | 3 +
include/linux/blkdev.h | 4 +-
include/linux/compiler.h | 2 +-
include/linux/iio/buffer.h | 6 +-
include/net/bluetooth/hci_core.h | 2 +-
kernel/time/time.c | 6 +-
lib/vsprintf.c | 3 -
net/bluetooth/hci_conn.c | 27 +++--
net/bluetooth/hci_event.c | 15 ++-
net/bridge/netfilter/ebtables.c | 3 +-
net/ipv4/tcp_input.c | 2 +-
net/ipv4/tcp_ipv4.c | 4 +
net/ipv6/tcp_ipv6.c | 4 +
net/ipv6/xfrm6_policy.c | 2 +-
net/netfilter/ipvs/ip_vs_ctl.c | 21 +++-
net/xfrm/xfrm_policy.c | 5 +
sound/pci/hda/hda_controller.c | 4 +-
sound/pci/hda/patch_conexant.c | 2 +
sound/pci/hda/patch_realtek.c | 1 +
sound/soc/cirrus/edb93xx.c | 2 +-
sound/soc/cirrus/ep93xx-i2s.c | 26 +++--
sound/soc/cirrus/snappercl15.c | 2 +-
sound/soc/soc-dapm.c | 2 +
tools/perf/util/dso.c | 2 +
.../perf/util/intel-pt-decoder/intel-pt-decoder.c | 23 +++-
.../perf/util/intel-pt-decoder/intel-pt-decoder.h | 9 ++
.../util/intel-pt-decoder/intel-pt-pkt-decoder.c | 2 +-
tools/perf/util/intel-pt.c | 5 +
107 files changed, 685 insertions(+), 273 deletions(-)
Hi Greg,
Please consider backporting commit 7ec916f82c48, which fixes an issue
with iwlwifi module loading in some cases. Fabio initially reported the
issue and confirmed reverting fixed the problem, and it has also been
reported by at least one Fedora user[0] as fixing the problem.
Thanks!
[0] https://bugzilla.redhat.com/show_bug.cgi?id=1607092
A VM which has:
- a DMA capable device passed through to it (eg. network card);
- running a malicious kernel that ignores H_PUT_TCE failure;
- capability of using IOMMU pages bigger that physical pages
can create an IOMMU mapping that exposes (for example) 16MB of
the host physical memory to the device when only 64K was allocated to the VM.
The remaining 16MB - 64K will be some other content of host memory, possibly
including pages of the VM, but also pages of host kernel memory, host
programs or other VMs.
The attacking VM does not control the location of the page it can map,
and is only allowed to map as many pages as it has pages of RAM.
We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
an IOMMU page is contained in the physical page so the PCI hardware won't
get access to unassigned host memory; however this check is missing in
the KVM fastpath (the H_PUT_TCE accelerated code). We have been lucky so
far and have not hit this yet: the very first time the mapping happens,
tbl::it_userspace is not allocated yet, so we fall back to userspace,
which in turn calls the VFIO IOMMU driver; this fails and the guest does
not retry.
This stores the smallest preregistered page size in the preregistered
region descriptor and changes the mm_iommu_xxx API to check this against
the IOMMU page size.
This calculates the maximum page size as the minimum of the natural
region alignment and the compound page size. For the page shift this uses
the shift returned by find_linux_pte(), which indicates how the page is
mapped to the current userspace: if the page is huge and the shift is
non-zero, then it is a leaf pte and the page is mapped within the range.
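A worked example of the natural-alignment starting point (assuming 64K
base pages, i.e. PAGE_SHIFT = 16):

	ua = 0x1000000, entries = 256:
	    __ffs(0x1000000 | (256 << 16)) = __ffs(0x1000000) = 24
	    -> IOMMU pages up to 16MB allowed
	ua = 0x10000, entries = 16:
	    __ffs(0x10000 | (16 << 16)) = __ffs(0x110000) = 16
	    -> IOMMU pages capped at 64K

Each pinned page can then only lower mem->pageshift further via
min(mem->pageshift, pageshift).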
Fixes: 121f80ba68f1 ("KVM: PPC: VFIO: Add in-kernel acceleration for VFIO")
Cc: stable(a)vger.kernel.org # v4.12+
Signed-off-by: Alexey Kardashevskiy <aik(a)ozlabs.ru>
Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
(cherry picked from commit 76fa4975f3ed12d15762bc979ca44078598ed8ee)
Signed-off-by: Alexey Kardashevskiy <aik(a)ozlabs.ru>
---
The original patch did not apply because of fad953ce, which converted
all vmalloc calls to use array_size(), so the backport is pretty trivial
and applies to v4.17 stable as well.
---
arch/powerpc/include/asm/mmu_context.h | 4 ++--
arch/powerpc/kvm/book3s_64_vio.c | 2 +-
arch/powerpc/kvm/book3s_64_vio_hv.c | 6 ++++--
arch/powerpc/mm/mmu_context_iommu.c | 37 ++++++++++++++++++++++++++++++++--
drivers/vfio/vfio_iommu_spapr_tce.c | 2 +-
5 files changed, 43 insertions(+), 8 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 44fdf47..6f67ff5 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -35,9 +35,9 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned long *hpa);
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa);
extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned long *hpa);
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa);
extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
#endif
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 4dffa61..e14cec6 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -433,7 +433,7 @@ long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
/* This only handles v2 IOMMU type, v1 is handled via ioctl() */
return H_TOO_HARD;
- if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa)))
+ if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
return H_HARDWARE;
if (mm_iommu_mapped_inc(mem))
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index c32e9bfe..648cf6c 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -262,7 +262,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl,
if (!mem)
return H_TOO_HARD;
- if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa)))
+ if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
+ &hpa)))
return H_HARDWARE;
pua = (void *) vmalloc_to_phys(pua);
@@ -431,7 +432,8 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
if (mem)
- prereg = mm_iommu_ua_to_hpa_rm(mem, ua, &tces) == 0;
+ prereg = mm_iommu_ua_to_hpa_rm(mem, ua,
+ IOMMU_PAGE_SHIFT_4K, &tces) == 0;
}
if (!prereg) {
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index e0a2d8e..8160559 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -19,6 +19,7 @@
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <asm/mmu_context.h>
+#include <asm/pte-walk.h>
static DEFINE_MUTEX(mem_list_mutex);
@@ -27,6 +28,7 @@ struct mm_iommu_table_group_mem_t {
struct rcu_head rcu;
unsigned long used;
atomic64_t mapped;
+ unsigned int pageshift;
u64 ua; /* userspace address */
u64 entries; /* number of entries in hpas[] */
u64 *hpas; /* vmalloc'ed */
@@ -126,6 +128,8 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
{
struct mm_iommu_table_group_mem_t *mem;
long i, j, ret = 0, locked_entries = 0;
+ unsigned int pageshift;
+ unsigned long flags;
struct page *page = NULL;
mutex_lock(&mem_list_mutex);
@@ -160,6 +164,12 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
goto unlock_exit;
}
+ /*
+ * For a starting point for a maximum page size calculation
+ * we use @ua and @entries natural alignment to allow IOMMU pages
+ * smaller than huge pages but still bigger than PAGE_SIZE.
+ */
+ mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
if (!mem->hpas) {
kfree(mem);
@@ -200,6 +210,23 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
}
}
populate:
+ pageshift = PAGE_SHIFT;
+ if (PageCompound(page)) {
+ pte_t *pte;
+ struct page *head = compound_head(page);
+ unsigned int compshift = compound_order(head);
+
+ local_irq_save(flags); /* disables as well */
+ pte = find_linux_pte(mm->pgd, ua, NULL, &pageshift);
+ local_irq_restore(flags);
+
+ /* Double check it is still the same pinned page */
+ if (pte && pte_page(*pte) == head &&
+ pageshift == compshift)
+ pageshift = max_t(unsigned int, pageshift,
+ PAGE_SHIFT);
+ }
+ mem->pageshift = min(mem->pageshift, pageshift);
mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
}
@@ -350,7 +377,7 @@ struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
EXPORT_SYMBOL_GPL(mm_iommu_find);
long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned long *hpa)
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
{
const long entry = (ua - mem->ua) >> PAGE_SHIFT;
u64 *va = &mem->hpas[entry];
@@ -358,6 +385,9 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
if (entry >= mem->entries)
return -EFAULT;
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
*hpa = *va | (ua & ~PAGE_MASK);
return 0;
@@ -365,7 +395,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
- unsigned long ua, unsigned long *hpa)
+ unsigned long ua, unsigned int pageshift, unsigned long *hpa)
{
const long entry = (ua - mem->ua) >> PAGE_SHIFT;
void *va = &mem->hpas[entry];
@@ -374,6 +404,9 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
if (entry >= mem->entries)
return -EFAULT;
+ if (pageshift > mem->pageshift)
+ return -EFAULT;
+
pa = (void *) vmalloc_to_phys(va);
if (!pa)
return -EFAULT;
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index b751dd6..b4c68f3 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -467,7 +467,7 @@ static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
if (!mem)
return -EINVAL;
- ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
+ ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
if (ret)
return -EINVAL;
--
2.11.0
xen/PVH: Set up GS segment for stack canary
commit 98014068328c5574de9a4a30b604111fd9d8f901 upstream
A 32bit PVH Xen kernel with CONFIG_CC_STACKPROTECTOR_STRONG fails to
boot. Xen detects a triple fault and kills the domain. The IP was
xen_prepare_pvh+9 corresponding to:
mov %gs:0x14,%eax
The 32bit kernel hasn't set up %gs when calling into xen_prepare_pvh.
Curiously, 64bit was not affected. The requested patch sets up the
canary for PVH to boot successfully.
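An illustrative sketch of the failure (assuming the standard 32-bit
stack-protector layout; not taken from the patch): every instrumented
prologue loads the canary through %gs, so with %gs still unset the very
first protected function faults before any exception handlers exist:

	xen_prepare_pvh:
		push   %ebp
		mov    %esp,%ebp
		mov    %gs:0x14,%eax    # canary load; %gs not yet set up -> fault
		...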
This is applicable to and has been tested on 4.14. It is also
applicable to 4.17.
Thanks,
Jason
The patch
ASoC: zte: Fix incorrect PCM format bit usages
has been applied to the asoc tree at
https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git
All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.
You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.
If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.
Please add any relevant lists and maintainers to the CCs when replying
to this mail.
Thanks,
Mark
From c889a45d229938a94b50aadb819def8bb11a6a54 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai(a)suse.de>
Date: Wed, 25 Jul 2018 22:40:49 +0200
Subject: [PATCH] ASoC: zte: Fix incorrect PCM format bit usages
zx-tdm driver sets the DAI driver definitions with the format bits
wrongly set with SNDRV_PCM_FORMAT_*, instead of SNDRV_PCM_FMTBIT_*.
This patch corrects the definitions.
Spotted by a sparse warning:
sound/soc/zte/zx-tdm.c:363:35: warning: restricted snd_pcm_format_t degrades to integer
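The distinction (paraphrasing include/sound/pcm.h): SNDRV_PCM_FORMAT_*
are enum values, while SNDRV_PCM_FMTBIT_* are the corresponding bit
masks, roughly:

	#define _SNDRV_PCM_FMTBIT(fmt)  (1ULL << (int)SNDRV_PCM_FORMAT_##fmt)
	#define SNDRV_PCM_FMTBIT_MU_LAW _SNDRV_PCM_FMTBIT(MU_LAW)
	#define SNDRV_PCM_FMTBIT_A_LAW  _SNDRV_PCM_FMTBIT(A_LAW)

OR-ing the small FORMAT_* enum values into a formats bitmask sets
unrelated low-order bits instead of the intended mu-law/A-law bits.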
Fixes: 870e0ddc4345 ("ASoC: zx-tdm: add zte's tdm controller driver")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
Signed-off-by: Mark Brown <broonie(a)kernel.org>
---
sound/soc/zte/zx-tdm.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sound/soc/zte/zx-tdm.c b/sound/soc/zte/zx-tdm.c
index dc955272f58b..389272eeba9a 100644
--- a/sound/soc/zte/zx-tdm.c
+++ b/sound/soc/zte/zx-tdm.c
@@ -144,8 +144,8 @@ static void zx_tdm_rx_dma_en(struct zx_tdm_info *tdm, bool on)
#define ZX_TDM_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000)
#define ZX_TDM_FMTBIT \
- (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FORMAT_MU_LAW | \
- SNDRV_PCM_FORMAT_A_LAW)
+ (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_MU_LAW | \
+ SNDRV_PCM_FMTBIT_A_LAW)
static int zx_tdm_dai_probe(struct snd_soc_dai *dai)
{
--
2.18.0
Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
allows the writeback rate to be faster if there is no I/O request on a
bcache device. It works well if there is only one bcache device attached
to the cache set. If there are many bcache devices attached to a cache
set, it may introduce a performance regression because the multiple
faster writeback threads of the idle bcache devices will compete for the
btree level locks with the bcache device that has I/O requests coming in.
This patch fixes the above issue by only permitting fast writeback when
all bcache devices attached to the cache set are idle. If one of the
bcache devices gets a new I/O request, all writeback throughput is
minimized immediately and the PI controller __update_writeback_rate()
decides the upcoming writeback rate for each bcache device.
Also, when all bcache devices are idle, limiting the writeback rate to a
small number wastes throughput, especially when the backing devices are
slower non-rotational devices (e.g. SATA SSDs). This patch sets a max
writeback rate for each backing device if the whole cache set is idle. A
faster writeback rate in idle time means new I/Os may have more available
space for dirty data, and people may observe better write performance
then.
Please note bcache may change its cache mode at run time, and this patch
still works if the cache mode is switched away from writeback while there
is still dirty data on the cache.
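A rough worked example of the idle threshold (assuming the default
writeback_rate_update_seconds of 5): with 3 attached backing devices,

	threshold = attached_dev_nr * 6 = 18 idle_counter increments
	-> ~6 update rounds per device = ~30 seconds of cache-set-wide
	   idleness before every device jumps to the maximum rate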
Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
Cc: stable(a)vger.kernel.org #4.16+
Signed-off-by: Coly Li <colyli(a)suse.de>
Tested-by: Kai Krakow <kai(a)kaishome.de>
Tested-by: Stefan Priebe <s.priebe(a)profihost.ag>
Cc: Michael Lyle <mlyle(a)lyle.org>
---
drivers/md/bcache/bcache.h | 10 ++--
drivers/md/bcache/request.c | 54 ++++++++++++++++++++-
drivers/md/bcache/super.c | 4 ++
drivers/md/bcache/sysfs.c | 15 ++++--
drivers/md/bcache/util.c | 2 +-
drivers/md/bcache/util.h | 2 +-
drivers/md/bcache/writeback.c | 91 +++++++++++++++++++++++------------
7 files changed, 134 insertions(+), 44 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5f7082aab1b0..97489573dedc 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
*/
atomic_t has_dirty;
- /*
- * Set to zero by things that touch the backing volume-- except
- * writeback. Incremented by writeback. Used to determine when to
- * accelerate idle writeback.
- */
- atomic_t backing_idle;
-
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
@@ -515,6 +508,8 @@ struct cache_set {
struct cache_accounting accounting;
unsigned long flags;
+ atomic_t idle_counter;
+ atomic_t at_max_writeback_rate;
struct cache_sb sb;
@@ -524,6 +519,7 @@ struct cache_set {
struct bcache_device **devices;
unsigned devices_max_used;
+ atomic_t attached_dev_nr;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
atomic_long_t flash_dev_dirty_sectors;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 91206f329971..86a977c2a176 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1105,6 +1105,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
generic_make_request(bio);
}
+static void quit_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *this_dc)
+{
+ int i;
+ struct bcache_device *d;
+ struct cached_dev *dc;
+
+ /*
+ * mutex bch_register_lock may compete with other parallel requesters,
+ * or attach/detach operations on other backing device. Waiting to
+ * the mutex lock may increase I/O request latency for seconds or more.
+ * To avoid such situation, if mutex_trylock() failed, only writeback
+ * rate of current cached device is set to 1, and __update_writeback_rate()
+ * will decide writeback rate of other cached devices (remember now
+ * c->idle_counter is 0 already).
+ */
+ if (mutex_trylock(&bch_register_lock)) {
+ for (i = 0; i < c->devices_max_used; i++) {
+ if (!c->devices[i])
+ continue;
+
+ if (UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+
+ d = c->devices[i];
+ dc = container_of(d, struct cached_dev, disk);
+ /*
+ * set writeback rate to default minimum value,
+ * then let update_writeback_rate() to decide the
+ * upcoming rate.
+ */
+ atomic_long_set(&dc->writeback_rate.rate, 1);
+ }
+ mutex_unlock(&bch_register_lock);
+ } else
+ atomic_long_set(&this_dc->writeback_rate.rate, 1);
+}
+
/* Cached devices - read & write stuff */
static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1122,7 +1160,21 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- atomic_set(&dc->backing_idle, 0);
+ if (likely(d->c)) {
+ if (atomic_read(&d->c->idle_counter))
+ atomic_set(&d->c->idle_counter, 0);
+ /*
+ * If at_max_writeback_rate of cache set is true and new I/O
+ * comes, quit max writeback rate of all cached devices
+ * attached to this cache set, and set at_max_writeback_rate
+ * to false.
+ */
+ if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+ atomic_set(&d->c->at_max_writeback_rate, 0);
+ quit_max_writeback_rate(d->c, dc);
+ }
+ }
+
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index f517d7d1fa10..32b95f3b9461 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -696,6 +696,8 @@ static void bcache_device_detach(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
+ atomic_dec(&d->c->attached_dev_nr);
+
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
struct uuid_entry *u = d->c->uuids + d->id;
@@ -1144,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
bch_cached_dev_run(dc);
bcache_device_link(&dc->disk, c, "bdev");
+ atomic_inc(&c->attached_dev_nr);
/* Allow the writeback thread to proceed */
up_write(&dc->writeback_lock);
@@ -1696,6 +1699,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
+ atomic_set(&c->attached_dev_nr, 0);
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 3e9d3459a224..6e88142514fb 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -171,7 +171,8 @@ SHOW(__bch_cached_dev)
var_printf(writeback_running, "%i");
var_print(writeback_delay);
var_print(writeback_percent);
- sysfs_hprint(writeback_rate, wb ? dc->writeback_rate.rate << 9 : 0);
+ sysfs_hprint(writeback_rate,
+ wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0);
sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
sysfs_printf(io_error_limit, "%i", dc->error_limit);
sysfs_printf(io_disable, "%i", dc->io_disable);
@@ -193,7 +194,9 @@ SHOW(__bch_cached_dev)
* Except for dirty and target, other values should
* be 0 if writeback is not running.
*/
- bch_hprint(rate, wb ? dc->writeback_rate.rate << 9 : 0);
+ bch_hprint(rate,
+ wb ? atomic_long_read(&dc->writeback_rate.rate) << 9
+ : 0);
bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
bch_hprint(proportional,
@@ -261,8 +264,12 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
- sysfs_strtoul_clamp(writeback_rate,
- dc->writeback_rate.rate, 1, INT_MAX);
+ if (attr == &sysfs_writeback_rate) {
+ int v;
+
+ sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
+ atomic_long_set(&dc->writeback_rate.rate, v);
+ }
sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index f912c372978c..c6a99dfa1ad9 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+ d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
/* Bound the time. Don't let us fall further than 2 seconds behind
* (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index a1579e28049f..5ff055f0a653 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -443,7 +443,7 @@ struct bch_ratelimit {
* Rate at which we want to do work, in units per second
* The units here correspond to the units passed to bch_next_delay()
*/
- uint32_t rate;
+ atomic_long_t rate;
};
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 912e969fedba..481d4cf38ac0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
dc->writeback_rate_proportional = proportional_scaled;
dc->writeback_rate_integral_scaled = integral_scaled;
- dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
- dc->writeback_rate.rate = new_rate;
+ dc->writeback_rate_change = new_rate -
+ atomic_long_read(&dc->writeback_rate.rate);
+ atomic_long_set(&dc->writeback_rate.rate, new_rate);
dc->writeback_rate_target = target;
}
+static bool set_at_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *dc)
+{
+ /*
+ * Idle_counter is increased every time update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then the max writeback
+ * rate is set on each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting the exact number
+ * of dirty cached devices, c->attached_dev_nr is used to calculate the
+ * idle threshold. It might be bigger if not all cached devices are in
+ * writeback mode, but it still works well with limited extra rounds of
+ * update_writeback_rate().
+ */
+ if (atomic_inc_return(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6)
+ return false;
+
+ if (atomic_read(&c->at_max_writeback_rate) != 1)
+ atomic_set(&c->at_max_writeback_rate, 1);
+
+ atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
+
+ /* keep writeback_rate_target as existing value */
+ dc->writeback_rate_proportional = 0;
+ dc->writeback_rate_integral_scaled = 0;
+ dc->writeback_rate_change = 0;
+
+ /*
+ * Check c->idle_counter and c->at_max_writeback_rate again in case
+ * new I/O arrives before set_at_max_writeback_rate() returns.
+ * Then the writeback rate is set to 1, and its new value should be
+ * decided via __update_writeback_rate().
+ */
+ if ((atomic_read(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6) ||
+ !atomic_read(&c->at_max_writeback_rate))
+ return false;
+
+ return true;
+}
+
static void update_writeback_rate(struct work_struct *work)
{
struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
return;
}
- down_read(&dc->writeback_lock);
-
- if (atomic_read(&dc->has_dirty) &&
- dc->writeback_percent)
- __update_writeback_rate(dc);
+ if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+ /*
+ * If the whole cache set is idle, set_at_max_writeback_rate()
+ * will set writeback rate to a max number. Then it is
+ * unnecessary to update writeback rate for an idle cache set
+ * in maximum writeback rate number(s).
+ */
+ if (!set_at_max_writeback_rate(c, dc)) {
+ down_read(&dc->writeback_lock);
+ __update_writeback_rate(dc);
+ up_read(&dc->writeback_lock);
+ }
+ }
- up_read(&dc->writeback_lock);
/*
* CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
delay = writeback_delay(dc, size);
- /* If the control system would wait for at least half a
- * second, and there's been no reqs hitting the backing disk
- * for awhile: use an alternate mode where we have at most
- * one contiguous set of writebacks in flight at a time. If
- * someone wants to do IO it will be quick, as it will only
- * have to contend with one operation in flight, and we'll
- * be round-tripping data to the backing disk as quickly as
- * it can accept it.
- */
- if (delay >= HZ / 2) {
- /* 3 means at least 1.5 seconds, up to 7.5 if we
- * have slowed way down.
- */
- if (atomic_inc_return(&dc->backing_idle) >= 3) {
- /* Wait for current I/Os to finish */
- closure_sync(&cl);
- /* And immediately launch a new set. */
- delay = 0;
- }
- }
-
while (!kthread_should_stop() &&
!test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
delay) {
@@ -741,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_running = true;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
- dc->writeback_rate.rate = 1024;
+ atomic_long_set(&dc->writeback_rate.rate, 1024);
dc->writeback_rate_minimum = 8;
dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
--
2.17.1
There is a window for racing when printing directly to task->comm,
allowing other threads to see a non-terminated string. The vsnprintf
function fills the buffer, counts the truncated chars, then finally
writes the \0 at the end.
   creator                      other
 vsnprintf:
   fill (not terminated)
   count the rest               trace_sched_waking(p):
   ...                            memcpy(comm, p->comm, TASK_COMM_LEN)
   write \0
The consequences depend on how 'other' uses the string. In our case,
it was copied into the tracing system's saved cmdlines, an array of
adjacent TASK_COMM_LEN-byte buffers (note the 'n' where 0 should be):
crash-arm64> x/1024s savedcmd->saved_cmdlines | grep 'evenk'
0xffffffd5b3818640: "irq/497-pwr_evenkworker/u16:12"
...and a strcpy out of there would cause stack corruption:
[224761.522292] Kernel panic - not syncing: stack-protector:
Kernel stack is corrupted in: ffffff9bf9783c78
crash-arm64> kbt | grep 'comm\|trace_print_context'
#6 0xffffff9bf9783c78 in trace_print_context+0x18c(+396)
comm (char [16]) = "irq/497-pwr_even"
crash-arm64> rd 0xffffffd4d0e17d14 8
ffffffd4d0e17d14: 2f71726900000000 5f7277702d373934 ....irq/497-pwr_
ffffffd4d0e17d24: 726f776b6e657665 3a3631752f72656b evenkworker/u16:
ffffffd4d0e17d34: f9780248ff003231 cede60e0ffffff9b 12..H.x......`..
ffffffd4d0e17d44: cede60c8ffffffd4 00000fffffffffd4 .....`..........
The workaround in e09e28671 (use strlcpy in __trace_find_cmdline) was
likely needed because of this same bug.
Solved by vsnprintf()'ing to a local buffer, then using set_task_comm().
This way, there won't be a window where comm is not terminated.
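For illustration, here is a minimal userspace sketch of the same pattern
(COMM_LEN, set_comm() and get_comm() are made-up names for this example;
in the kernel, set_task_comm() takes task_lock() to serialize writers).
The formatting happens in a private buffer, so the shared buffer never
holds the unterminated intermediate state:

/*
 * Minimal sketch, not kernel code: format into a private buffer first,
 * then publish the already-terminated string, so a concurrent reader
 * that snapshots the buffer never observes the unterminated state
 * vsnprintf() leaves behind while it is still counting.
 */
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#define COMM_LEN 16	/* mirrors TASK_COMM_LEN */

static char comm[COMM_LEN] = "init";
static pthread_mutex_t comm_lock = PTHREAD_MUTEX_INITIALIZER;

static void set_comm(const char *fmt, ...)	/* plays set_task_comm() */
{
	char buf[COMM_LEN];
	va_list args;

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);	/* terminate privately */
	va_end(args);

	pthread_mutex_lock(&comm_lock);
	strcpy(comm, buf);	/* shared buffer only ever holds terminated data */
	pthread_mutex_unlock(&comm_lock);
}

static void get_comm(char *out)	/* plays the tracing-side memcpy() */
{
	pthread_mutex_lock(&comm_lock);
	memcpy(out, comm, COMM_LEN);
	pthread_mutex_unlock(&comm_lock);
}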
Cc: stable(a)vger.kernel.org
Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure")
Reviewed-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
Signed-off-by: Snild Dolkow <snild(a)sony.com>
---
kernel/kthread.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 481951bf091d..1a481ae12dec 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -319,8 +319,14 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
task = create->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
+ char name[TASK_COMM_LEN];
- vsnprintf(task->comm, sizeof(task->comm), namefmt, args);
+ /*
+ * task is already visible to other tasks, so updating
+ * COMM must be protected.
+ */
+ vsnprintf(name, sizeof(name), namefmt, args);
+ set_task_comm(task, name);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
--
2.15.1
Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
allows the writeback rate to be faster if there is no I/O request on a
bcache device. It works well if there is only one bcache device attached
to the cache set. If there are many bcache devices attached to a cache
set, it may introduce a performance regression because the multiple faster
writeback threads of the idle bcache devices will compete for the btree
level locks with the bcache devices that have I/O requests coming.
This patch fixes the above issue by only permitting fast writeback when
all bcache devices attached to the cache set are idle. If one of the
bcache devices has a new I/O request coming, all writeback throughput is
minimized immediately and the PI controller __update_writeback_rate()
decides the upcoming writeback rate for each bcache device.
Also, when all bcache devices are idle, limiting the writeback rate to a
small number is a waste of throughput, especially when the backing
devices are slower non-rotational devices (e.g. SATA SSD). This patch
sets a max writeback rate for each backing device if the whole cache set
is idle. A faster writeback rate in idle time means new I/Os may have
more available space for dirty data, and people may observe better write
performance then.
Please note that bcache may change its cache mode at run time, and this
patch still works if the cache mode is switched away from writeback mode
while there is still dirty data in the cache.
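The counting scheme can be summarized with a rough userspace model. The
names below mirror the patch, but this is only an illustration of the
heuristic, not the driver code: each rate-update round increments
idle_counter, any incoming I/O resets it, and only after about 6 rounds
per attached device does the cache set switch to the maximum rate.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>

atomic_int idle_counter;
atomic_int attached_dev_nr;
atomic_long writeback_rate;	/* one cached device shown for brevity */

/* Called from each update_writeback_rate() round. */
static bool try_set_max_writeback_rate(void)
{
	/* Not idle long enough yet: keep the PI-controlled rate. */
	if (atomic_fetch_add(&idle_counter, 1) + 1 <
	    atomic_load(&attached_dev_nr) * 6)
		return false;

	atomic_store(&writeback_rate, INT_MAX);	/* go full speed */

	/* Re-check: new I/O may have reset idle_counter meanwhile. */
	if (atomic_load(&idle_counter) <
	    atomic_load(&attached_dev_nr) * 6)
		return false;	/* caller falls back to the PI controller */

	return true;
}

/* Called on every incoming request to a cached device. */
static void on_new_io(void)
{
	atomic_store(&idle_counter, 0);	/* any request ends the idle state */
	atomic_store(&writeback_rate, 1);	/* quit max rate until recomputed */
}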
Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
Cc: stable(a)vger.kernel.org #4.16+
Signed-off-by: Coly Li <colyli(a)suse.de>
Tested-by: Kai Krakow <kai(a)kaishome.de>
Cc: Michael Lyle <mlyle(a)lyle.org>
Cc: Stefan Priebe <s.priebe(a)profihost.ag>
---
Changelog:
v3, Do not acquire bch_register_lock in set_at_max_writeback_rate().
v2, Fix a deadlock reported by Stefan Priebe.
v1, Initial version.
drivers/md/bcache/bcache.h | 10 ++--
drivers/md/bcache/request.c | 54 ++++++++++++++++++++-
drivers/md/bcache/super.c | 4 ++
drivers/md/bcache/sysfs.c | 14 ++++--
drivers/md/bcache/util.c | 2 +-
drivers/md/bcache/util.h | 2 +-
drivers/md/bcache/writeback.c | 91 +++++++++++++++++++++++------------
7 files changed, 133 insertions(+), 44 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 872ef4d67711..13f908be42ba 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
*/
atomic_t has_dirty;
- /*
- * Set to zero by things that touch the backing volume-- except
- * writeback. Incremented by writeback. Used to determine when to
- * accelerate idle writeback.
- */
- atomic_t backing_idle;
-
struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update;
@@ -515,6 +508,8 @@ struct cache_set {
struct cache_accounting accounting;
unsigned long flags;
+ atomic_t idle_counter;
+ atomic_t at_max_writeback_rate;
struct cache_sb sb;
@@ -524,6 +519,7 @@ struct cache_set {
struct bcache_device **devices;
unsigned devices_max_used;
+ atomic_t attached_dev_nr;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
atomic_long_t flash_dev_dirty_sectors;
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 8eece9ef9f46..26f97acde403 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1105,6 +1105,44 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
generic_make_request(bio);
}
+static void quit_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *this_dc)
+{
+ int i;
+ struct bcache_device *d;
+ struct cached_dev *dc;
+
+ /*
+ * mutex bch_register_lock may compete with other parallel requesters,
+ * or attach/detach operations on other backing devices. Waiting for
+ * the mutex lock may increase I/O request latency for seconds or more.
+ * To avoid such a situation, if mutex_trylock() fails, only the
+ * writeback rate of the current cached device is set to 1, and
+ * update_writeback_rate() will decide the writeback rate of the other
+ * cached devices (remember c->idle_counter is 0 already).
+ */
+ if (mutex_trylock(&bch_register_lock)) {
+ for (i = 0; i < c->devices_max_used; i++) {
+ if (!c->devices[i])
+ continue;
+
+ if (UUID_FLASH_ONLY(&c->uuids[i]))
+ continue;
+
+ d = c->devices[i];
+ dc = container_of(d, struct cached_dev, disk);
+ /*
+ * set writeback rate to default minimum value,
+ * then let update_writeback_rate() decide the
+ * upcoming rate.
+ */
+ atomic_long_set(&dc->writeback_rate.rate, 1);
+ }
+ mutex_unlock(&bch_register_lock);
+ } else
+ atomic_long_set(&this_dc->writeback_rate.rate, 1);
+}
+
/* Cached devices - read & write stuff */
static blk_qc_t cached_dev_make_request(struct request_queue *q,
@@ -1122,7 +1160,21 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- atomic_set(&dc->backing_idle, 0);
+ if (likely(d->c)) {
+ if (atomic_read(&d->c->idle_counter))
+ atomic_set(&d->c->idle_counter, 0);
+ /*
+ * If at_max_writeback_rate of cache set is true and new I/O
+ * comes, quit max writeback rate of all cached devices
+ * attached to this cache set, and set at_max_writeback_rate
+ * to false.
+ */
+ if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+ atomic_set(&d->c->at_max_writeback_rate, 0);
+ quit_max_writeback_rate(d->c, dc);
+ }
+ }
+
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e0a92104ca23..8db6696e2bff 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -696,6 +696,8 @@ static void bcache_device_detach(struct bcache_device *d)
{
lockdep_assert_held(&bch_register_lock);
+ atomic_dec(&d->c->attached_dev_nr);
+
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
struct uuid_entry *u = d->c->uuids + d->id;
@@ -1144,6 +1146,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
bch_cached_dev_run(dc);
bcache_device_link(&dc->disk, c, "bdev");
+ atomic_inc(&c->attached_dev_nr);
/* Allow the writeback thread to proceed */
up_write(&dc->writeback_lock);
@@ -1695,6 +1698,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
c->devices_max_used = 0;
+ atomic_set(&c->attached_dev_nr, 0);
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 225b15aa0340..a56067e80b10 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -170,7 +170,8 @@ SHOW(__bch_cached_dev)
var_printf(writeback_running, "%i");
var_print(writeback_delay);
var_print(writeback_percent);
- sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9);
+ sysfs_hprint(writeback_rate,
+ atomic_long_read(&dc->writeback_rate.rate) << 9);
sysfs_hprint(io_errors, atomic_read(&dc->io_errors));
sysfs_printf(io_error_limit, "%i", dc->error_limit);
sysfs_printf(io_disable, "%i", dc->io_disable);
@@ -188,7 +189,8 @@ SHOW(__bch_cached_dev)
char change[20];
s64 next_io;
- bch_hprint(rate, dc->writeback_rate.rate << 9);
+ bch_hprint(rate,
+ atomic_long_read(&dc->writeback_rate.rate) << 9);
bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
bch_hprint(proportional,dc->writeback_rate_proportional << 9);
@@ -255,8 +257,12 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
- sysfs_strtoul_clamp(writeback_rate,
- dc->writeback_rate.rate, 1, INT_MAX);
+ if (attr == &sysfs_writeback_rate) {
+ int v;
+
+ sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
+ atomic_long_set(&dc->writeback_rate.rate, v);
+ }
sysfs_strtoul_clamp(writeback_rate_update_seconds,
dc->writeback_rate_update_seconds,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index f912c372978c..c6a99dfa1ad9 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
uint64_t now = local_clock();
- d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+ d->next += div_u64(done * NSEC_PER_SEC, atomic_long_read(&d->rate));
/* Bound the time. Don't let us fall further than 2 seconds behind
* (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index a1579e28049f..5ff055f0a653 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -443,7 +443,7 @@ struct bch_ratelimit {
* Rate at which we want to do work, in units per second
* The units here correspond to the units passed to bch_next_delay()
*/
- uint32_t rate;
+ atomic_long_t rate;
};
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 912e969fedba..907fa6c0d192 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -104,11 +104,56 @@ static void __update_writeback_rate(struct cached_dev *dc)
dc->writeback_rate_proportional = proportional_scaled;
dc->writeback_rate_integral_scaled = integral_scaled;
- dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
- dc->writeback_rate.rate = new_rate;
+ dc->writeback_rate_change = new_rate -
+ atomic_long_read(&dc->writeback_rate.rate);
+ atomic_long_set(&dc->writeback_rate.rate, new_rate);
dc->writeback_rate_target = target;
}
+static bool set_at_max_writeback_rate(struct cache_set *c,
+ struct cached_dev *dc)
+{
+ /*
+ * Idle_counter is increased every time update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then the max writeback
+ * rate is set on each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting the exact number
+ * of dirty cached devices, c->attached_dev_nr is used to calculate the
+ * idle threshold. It might be bigger if not all cached devices are in
+ * writeback mode, but it still works well with limited extra rounds of
+ * update_writeback_rate().
+ */
+ if (atomic_inc_return(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6)
+ return false;
+
+ if (atomic_read(&c->at_max_writeback_rate) != 1)
+ atomic_set(&c->at_max_writeback_rate, 1);
+
+ atomic_long_set(&dc->writeback_rate.rate, INT_MAX);
+
+ /* keep writeback_rate_target as existing value */
+ dc->writeback_rate_proportional = 0;
+ dc->writeback_rate_integral_scaled = 0;
+ dc->writeback_rate_change = 0;
+
+ /*
+ * Check c->idle_counter and c->at_max_writeback_rate again in case
+ * new I/O arrives before set_at_max_writeback_rate() returns.
+ * Then the writeback rate is set to 1, and its new value should be
+ * decided via __update_writeback_rate().
+ */
+ if ((atomic_read(&c->idle_counter) <
+ atomic_read(&c->attached_dev_nr) * 6) ||
+ !atomic_read(&c->at_max_writeback_rate))
+ return false;
+
+ return true;
+}
+
static void update_writeback_rate(struct work_struct *work)
{
struct cached_dev *dc = container_of(to_delayed_work(work),
@@ -136,13 +181,20 @@ static void update_writeback_rate(struct work_struct *work)
return;
}
- down_read(&dc->writeback_lock);
-
- if (atomic_read(&dc->has_dirty) &&
- dc->writeback_percent)
- __update_writeback_rate(dc);
+ if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+ /*
+ * If the whole cache set is idle, set_at_max_writeback_rate()
+ * will set writeback rate to a max number. Then it is
+ * unnecessary to update writeback rate for an idle cache set
+ * in maximum writeback rate number(s).
+ */
+ if (!set_at_max_writeback_rate(c, dc)) {
+ down_read(&dc->writeback_lock);
+ __update_writeback_rate(dc);
+ up_read(&dc->writeback_lock);
+ }
+ }
- up_read(&dc->writeback_lock);
/*
* CACHE_SET_IO_DISABLE might be set via sysfs interface,
@@ -422,27 +474,6 @@ static void read_dirty(struct cached_dev *dc)
delay = writeback_delay(dc, size);
- /* If the control system would wait for at least half a
- * second, and there's been no reqs hitting the backing disk
- * for awhile: use an alternate mode where we have at most
- * one contiguous set of writebacks in flight at a time. If
- * someone wants to do IO it will be quick, as it will only
- * have to contend with one operation in flight, and we'll
- * be round-tripping data to the backing disk as quickly as
- * it can accept it.
- */
- if (delay >= HZ / 2) {
- /* 3 means at least 1.5 seconds, up to 7.5 if we
- * have slowed way down.
- */
- if (atomic_inc_return(&dc->backing_idle) >= 3) {
- /* Wait for current I/Os to finish */
- closure_sync(&cl);
- /* And immediately launch a new set. */
- delay = 0;
- }
- }
-
while (!kthread_should_stop() &&
!test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
delay) {
@@ -741,7 +772,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_running = true;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
- dc->writeback_rate.rate = 1024;
+ atomic_long_set(&dc->writeback_rate.rate, 1024);
dc->writeback_rate_minimum = 8;
dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
--
2.17.1
Some of the MSRs returned by GET_MSR_INDEX_LIST currently cannot be sent back
to KVM_GET_MSR and/or KVM_SET_MSR; either they can never be sent back, or
they are only accepted under special conditions. This makes the API a pain to
use.
To avoid this pain, this patch makes it so that the result of the get-list
ioctl can always be used for host-initiated get and set. Since we don't have
a separate way to check for read-only MSRs, this means some Hyper-V MSRs are
ignored when written. Arguably they should not even be in the result of
GET_MSR_INDEX_LIST, but I am leaving there in case userspace is using the
outcome of GET_MSR_INDEX_LIST to derive the support for the corresponding
Hyper-V feature.
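The guarantee is easiest to see from the userspace side. Below is a
hedged sketch of the intended usage (error handling trimmed; vcpu_fd is
assumed to come from the usual KVM_CREATE_VM/KVM_CREATE_VCPU sequence,
and MAX_MSRS is an arbitrary capacity for the sketch): after this patch,
feeding every index returned by KVM_GET_MSR_INDEX_LIST into a
host-initiated KVM_GET_MSRS should succeed for all of them.

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define MAX_MSRS 256	/* arbitrary capacity for this sketch */

static void read_all_listed_msrs(int vcpu_fd)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	struct kvm_msr_list *list;
	struct kvm_msrs *msrs;
	int n;

	/* Ask the module which MSRs it supports. */
	list = calloc(1, sizeof(*list) + MAX_MSRS * sizeof(__u32));
	list->nmsrs = MAX_MSRS;
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);

	/* Build a host-initiated read of every advertised MSR. */
	msrs = calloc(1, sizeof(*msrs) +
			 list->nmsrs * sizeof(struct kvm_msr_entry));
	msrs->nmsrs = list->nmsrs;
	for (__u32 i = 0; i < list->nmsrs; i++)
		msrs->entries[i].index = list->indices[i];

	/*
	 * With this patch the return value is expected to equal nmsrs,
	 * instead of stopping at the first MSR that rejects non-host
	 * accesses.
	 */
	n = ioctl(vcpu_fd, KVM_GET_MSRS, msrs);
	printf("read %d of %u MSRs\n", n, msrs->nmsrs);

	free(msrs);
	free(list);
	close(kvm_fd);
}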
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kvm/hyperv.c | 27 ++++++++++++++++++++-------
arch/x86/kvm/hyperv.h | 2 +-
arch/x86/kvm/x86.c | 15 +++++++++------
3 files changed, 30 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index af8caf965baa..01d209ab5481 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
int ret;
- if (!synic->active)
+ if (!synic->active && !host)
return 1;
trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
return ret;
}
-static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+ bool host)
{
int ret;
- if (!synic->active)
+ if (!synic->active && !host)
return 1;
ret = 0;
@@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
case HV_X64_MSR_TSC_EMULATION_STATUS:
hv->hv_tsc_emulation_status = data;
break;
+ case HV_X64_MSR_TIME_REF_COUNT:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
data, host);
}
+ case HV_X64_MSR_TSC_FREQUENCY:
+ case HV_X64_MSR_APIC_FREQUENCY:
+ /* read-only, but still ignore it if host-initiated */
+ if (!host)
+ return 1;
+ break;
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+ bool host)
{
u64 data = 0;
struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_SIMP:
case HV_X64_MSR_EOM:
case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
- return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata);
+ return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
case HV_X64_MSR_STIMER0_CONFIG:
case HV_X64_MSR_STIMER1_CONFIG:
case HV_X64_MSR_STIMER2_CONFIG:
@@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return kvm_hv_set_msr(vcpu, msr, data, host);
}
-int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
return r;
} else
- return kvm_hv_get_msr(vcpu, msr, pdata);
+ return kvm_hv_get_msr(vcpu, msr, pdata, host);
}
static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 837465d69c6d..d6aa969e20f1 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
}
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
-int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
bool kvm_hv_hypercall_enabled(struct kvm *kvm);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 153564db7980..f2876053e28b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2166,10 +2166,11 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.mcg_status = data;
break;
case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P))
+ if (!(mcg_cap & MCG_CTL_P) &&
+ (data || !msr_info->host_initiated))
return 1;
if (data != 0 && data != ~(u64)0)
- return -1;
+ return 1;
vcpu->arch.mcg_ctl = data;
break;
default:
@@ -2557,7 +2558,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
-static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
u64 data;
u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -2572,7 +2573,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = vcpu->arch.mcg_cap;
break;
case MSR_IA32_MCG_CTL:
- if (!(mcg_cap & MCG_CTL_P))
+ if (!(mcg_cap & MCG_CTL_P) && !host)
return 1;
data = vcpu->arch.mcg_ctl;
break;
@@ -2705,7 +2706,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -2726,7 +2728,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
return kvm_hv_get_msr_common(vcpu,
- msr_info->index, &msr_info->data);
+ msr_info->index, &msr_info->data,
+ msr_info->host_initiated);
break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
--
2.17.1