From: Herbert Xu <herbert@gondor.apana.org.au>
[ Upstream commit 882aa6525cabcfa0cea61e1a19c9af4c543118ac ]
Module device tables need to be declared as __maybe_unused because they
will be unused when the driver is built-in and the corresponding bus
option is also disabled.

This patch adds the __maybe_unused attribute to the OF and ACPI tables.
This also allows us to remove the ifdef around the ACPI data structure.
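As a general illustration (not part of this patch; the device name is made
up), this is the shape a warning-free table takes:

  /* With CONFIG_OF=n and the driver built in, of_match_ptr() expands to
   * NULL and the table is otherwise unreferenced; __maybe_unused
   * silences the resulting -Wunused-const-variable warning. */
  static const struct of_device_id __maybe_unused example_of_match[] = {
          { .compatible = "vendor,example" },
          { /* sentinel */ }
  };
  MODULE_DEVICE_TABLE(of, example_of_match);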
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Vinod Koul <vkoul@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/crypto/qcom-rng.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/crypto/qcom-rng.c b/drivers/crypto/qcom-rng.c
index 031b5f701a0a..72dd1a4ebac4 100644
--- a/drivers/crypto/qcom-rng.c
+++ b/drivers/crypto/qcom-rng.c
@@ -9,6 +9,7 @@
 #include <linux/crypto.h>
 #include <linux/io.h>
 #include <linux/iopoll.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
@@ -201,15 +202,13 @@ static int qcom_rng_remove(struct platform_device *pdev)
 	return 0;
 }
-#if IS_ENABLED(CONFIG_ACPI)
-static const struct acpi_device_id qcom_rng_acpi_match[] = {
+static const struct acpi_device_id __maybe_unused qcom_rng_acpi_match[] = {
 	{ .id = "QCOM8160", .driver_data = 1 },
 	{}
 };
 MODULE_DEVICE_TABLE(acpi, qcom_rng_acpi_match);
-#endif
-static const struct of_device_id qcom_rng_of_match[] = {
+static const struct of_device_id __maybe_unused qcom_rng_of_match[] = {
 	{ .compatible = "qcom,prng", .data = (void *)0},
 	{ .compatible = "qcom,prng-ee", .data = (void *)1},
 	{}
From: Jarkko Sakkinen <jarkko@profian.com>
[ Upstream commit b3b9fdf1a9be4266b01a2063b1f37cdc20806e3b ]
A quirk for fixing the committed TCB version, when upgrading from a firmware version earlier than 1.50. This is a known issue, and the documented workaround is to load the firmware twice.
Currently, this issue requires the following workaround:
sudo modprobe -r kvm_amd
sudo modprobe -r ccp
sudo modprobe ccp
sudo modprobe kvm_amd
Implement this workaround inside the kernel by checking whether the API
version is less than 1.50 and, if so, downloading the firmware twice. This
addresses the TCB version issue.
Link: https://lore.kernel.org/all/de02389f-249d-f565-1136-4af3655fab2a@profian.com...
Reported-by: Harald Hoyer <harald@profian.com>
Signed-off-by: Jarkko Sakkinen <jarkko@profian.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/crypto/ccp/sev-dev.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 9f588c9728f8..b292641c8a99 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -744,6 +744,11 @@ static int sev_update_firmware(struct device *dev)
 	struct page *p;
 	u64 data_size;
+	if (!sev_version_greater_or_equal(0, 15)) {
+		dev_dbg(dev, "DOWNLOAD_FIRMWARE not supported\n");
+		return -1;
+	}
+
 	if (sev_get_firmware(dev, &firmware) == -ENOENT) {
 		dev_dbg(dev, "No SEV firmware file present\n");
 		return -1;
@@ -776,6 +781,14 @@ static int sev_update_firmware(struct device *dev)
 	data->len = firmware->size;
 	ret = sev_do_cmd(SEV_CMD_DOWNLOAD_FIRMWARE, data, &error);
+
+	/*
+	 * A quirk for fixing the committed TCB version, when upgrading from
+	 * earlier firmware version than 1.50.
+	 */
+	if (!ret && !sev_version_greater_or_equal(1, 50))
+		ret = sev_do_cmd(SEV_CMD_DOWNLOAD_FIRMWARE, data, &error);
+
 	if (ret)
 		dev_dbg(dev, "Failed to update SEV firmware: %#x\n", error);
 	else
@@ -1285,8 +1298,7 @@ void sev_pci_init(void)
 	if (sev_get_api_version())
 		goto err;
-	if (sev_version_greater_or_equal(0, 15) &&
-	    sev_update_firmware(sev->dev) == 0)
+	if (sev_update_firmware(sev->dev) == 0)
 		sev_get_api_version();
/* If an init_ex_path is provided rely on INIT_EX for PSP initialization
From: Jacky Li <jackyli@google.com>
[ Upstream commit d8da2da21fdb1f5964c11c00f0cc84fb0edf31d0 ]
Currently the OS fails the PSP initialization when the file specified at 'init_ex_path' does not exist or has invalid content. However the SEV spec just requires users to allocate 32KB of 0xFF in the file, which can be taken care of by the OS easily.
To improve the robustness during the PSP init, leverage the retry mechanism and continue the init process:
Before the first INIT_EX call, if the content is invalid or missing, continue the process by feeding those contents into PSP instead of aborting. PSP will then override it with 32KB 0xFF and return SEV_RET_SECURE_DATA_INVALID status code. In the second INIT_EX call, this 32KB 0xFF content will then be fed and PSP will write the valid data to the file.
In order to do this, sev_read_init_ex_file() should only be called once,
for the first INIT_EX call. Calling it again for the second INIT_EX call
would cause the invalid file content to overwrite the valid 32KB 0xFF data
provided by the PSP in the first INIT_EX call.
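In sketch form (simplified, but using the driver's names), the resulting
retry flow is:

  /* 1st call: the buffer holds whatever was read (or 0xFF filler if the
   * file was missing); the PSP rejects it, rewrites the buffer with 32KB
   * of 0xFF, and returns SEV_RET_SECURE_DATA_INVALID. 2nd call: the 0xFF
   * buffer is accepted and the PSP produces valid NV data, which is
   * later written back to the file. */
  rc = init_function(&psp_ret);
  if (rc && psp_ret == SEV_RET_SECURE_DATA_INVALID)
          rc = init_function(&psp_ret);   /* retry with the PSP-fixed buffer */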
Co-developed-by: Peter Gonda <pgonda@google.com>
Signed-off-by: Peter Gonda <pgonda@google.com>
Signed-off-by: Jacky Li <jackyli@google.com>
Reported-by: Alper Gun <alpergun@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 .../virt/kvm/x86/amd-memory-encryption.rst |  5 ++-
 drivers/crypto/ccp/sev-dev.c               | 36 +++++++++++--------
 2 files changed, 24 insertions(+), 17 deletions(-)
diff --git a/Documentation/virt/kvm/x86/amd-memory-encryption.rst b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
index 2d307811978c..935aaeb97fe6 100644
--- a/Documentation/virt/kvm/x86/amd-memory-encryption.rst
+++ b/Documentation/virt/kvm/x86/amd-memory-encryption.rst
@@ -89,9 +89,8 @@ context. In a typical workflow, this command should be the first command issued.
 The firmware can be initialized either by using its own non-volatile storage or
 the OS can manage the NV storage for the firmware using the module parameter
-``init_ex_path``. The file specified by ``init_ex_path`` must exist. To create
-a new NV storage file allocate the file with 32KB bytes of 0xFF as required by
-the SEV spec.
+``init_ex_path``. If the file specified by ``init_ex_path`` does not exist or
+is invalid, the OS will create or override the file with output from PSP.
Returns: 0 on success, -negative on error
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index b292641c8a99..8512101f0bdf 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -211,18 +211,24 @@ static int sev_read_init_ex_file(void)
 	if (IS_ERR(fp)) {
 		int ret = PTR_ERR(fp);
-		dev_err(sev->dev,
-			"SEV: could not open %s for read, error %d\n",
-			init_ex_path, ret);
+		if (ret == -ENOENT) {
+			dev_info(sev->dev,
+				"SEV: %s does not exist and will be created later.\n",
+				init_ex_path);
+			ret = 0;
+		} else {
+			dev_err(sev->dev,
+				"SEV: could not open %s for read, error %d\n",
+				init_ex_path, ret);
+		}
 		return ret;
 	}
 	nread = kernel_read(fp, sev_init_ex_buffer, NV_LENGTH, NULL);
 	if (nread != NV_LENGTH) {
-		dev_err(sev->dev,
-			"SEV: failed to read %u bytes to non volatile memory area, ret %ld\n",
+		dev_info(sev->dev,
+			"SEV: could not read %u bytes to non volatile memory area, ret %ld\n",
 			NV_LENGTH, nread);
-		return -EIO;
 	}
 	dev_dbg(sev->dev, "SEV: read %ld bytes from NV file\n", nread);
@@ -410,17 +416,12 @@ static int __sev_init_locked(int *error)
 static int __sev_init_ex_locked(int *error)
 {
 	struct sev_data_init_ex data;
-	int ret;
 	memset(&data, 0, sizeof(data));
 	data.length = sizeof(data);
 	data.nv_address = __psp_pa(sev_init_ex_buffer);
 	data.nv_len = NV_LENGTH;
-	ret = sev_read_init_ex_file();
-	if (ret)
-		return ret;
-
 	if (sev_es_tmr) {
 		/*
 		 * Do not include the encryption mask on the physical
@@ -439,7 +440,7 @@ static int __sev_platform_init_locked(int *error)
 {
 	struct psp_device *psp = psp_master;
 	struct sev_device *sev;
-	int rc, psp_ret = -1;
+	int rc = 0, psp_ret = -1;
 	int (*init_function)(int *error);
 	if (!psp || !psp->sev_data)
@@ -450,8 +451,15 @@ static int __sev_platform_init_locked(int *error)
 	if (sev->state == SEV_STATE_INIT)
 		return 0;
-	init_function = sev_init_ex_buffer ? __sev_init_ex_locked :
-		       __sev_init_locked;
+	if (sev_init_ex_buffer) {
+		init_function = __sev_init_ex_locked;
+		rc = sev_read_init_ex_file();
+		if (rc)
+			return rc;
+	} else {
+		init_function = __sev_init_locked;
+	}
+
 	rc = init_function(&psp_ret);
 	if (rc && psp_ret == SEV_RET_SECURE_DATA_INVALID) {
 		/*
From: Andreas Gruenbacher <agruenba@redhat.com>
[ Upstream commit 204c0300c4e99707e9fb6e57840aa1127060e63f ]
Switch from strlcpy to strscpy and make sure that @count is the size of the smaller of the source and destination buffers. This prevents reading beyond the end of the source buffer when the source string isn't null terminated.
Found by a modified version of syzkaller.
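Illustration only (the error handling is hypothetical): the key behavioural
difference between the two copy helpers is what they read from the source:

  char dst[GFS2_LOCKNAME_LEN];

  /* strlcpy() returns strlen(src), so it scans src to its NUL even past
   * sizeof(dst) -- and past the end of src if src isn't NUL-terminated.
   * strscpy() reads at most sizeof(dst) bytes and returns -E2BIG on
   * truncation. */
  if (strscpy(dst, src, sizeof(dst)) < 0)
          return -EINVAL;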
Suggested-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/gfs2/ops_fstype.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c9b423c874a3..ba4dfef0c15d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -383,8 +383,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
 	if (!table[0])
 		table = sdp->sd_vfs->s_id;
-	strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN);
-	strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN);
+	BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN);
+
+	strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN);
+	strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN);
 	table = sdp->sd_table_name;
 	while ((table = strchr(table, '/')))
@@ -1441,13 +1443,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	switch (o) {
 	case Opt_lockproto:
-		strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
+		strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
 		break;
 	case Opt_locktable:
-		strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
+		strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
 		break;
 	case Opt_hostdata:
-		strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
+		strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
 		break;
 	case Opt_spectator:
 		args->ar_spectator = 1;
From: Marco Elver <elver@google.com>
[ Upstream commit f95e5a3d59011eec1257d0e76de1e1f8969d426f ]
Internal data structures (cpu_bps, task_bps) of powerpc's hw_breakpoint implementation have relied on nr_bp_mutex serializing access to them.
Before overhauling synchronization of kernel/events/hw_breakpoint.c, introduce 2 spinlocks to synchronize cpu_bps and task_bps respectively, thus avoiding reliance on callers synchronizing powerpc's hw_breakpoint.
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Ian Rogers <irogers@google.com>
Link: https://lore.kernel.org/r/20220829124719.675715-10-elver@google.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kernel/hw_breakpoint.c | 53 ++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 13 deletions(-)
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 2669f80b3a49..8db1a15d7acb 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
+#include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/init.h>
@@ -129,7 +130,14 @@ struct breakpoint {
 	bool ptrace_bp;
 };
+/*
+ * While kernel/events/hw_breakpoint.c does its own synchronization, we cannot
+ * rely on it safely synchronizing internals here; however, we can rely on it
+ * not requesting more breakpoints than available.
+ */
+static DEFINE_SPINLOCK(cpu_bps_lock);
 static DEFINE_PER_CPU(struct breakpoint *, cpu_bps[HBP_NUM_MAX]);
+static DEFINE_SPINLOCK(task_bps_lock);
 static LIST_HEAD(task_bps);
 static struct breakpoint *alloc_breakpoint(struct perf_event *bp)
@@ -174,7 +182,9 @@ static int task_bps_add(struct perf_event *bp)
 	if (IS_ERR(tmp))
 		return PTR_ERR(tmp);
+	spin_lock(&task_bps_lock);
 	list_add(&tmp->list, &task_bps);
+	spin_unlock(&task_bps_lock);
 	return 0;
 }
@@ -182,6 +192,7 @@ static void task_bps_remove(struct perf_event *bp)
 {
 	struct list_head *pos, *q;
+	spin_lock(&task_bps_lock);
 	list_for_each_safe(pos, q, &task_bps) {
 		struct breakpoint *tmp = list_entry(pos, struct breakpoint, list);
@@ -191,6 +202,7 @@ static void task_bps_remove(struct perf_event *bp)
 			break;
 		}
 	}
+	spin_unlock(&task_bps_lock);
 }
 /*
@@ -200,12 +212,17 @@ static void task_bps_remove(struct perf_event *bp)
 static bool all_task_bps_check(struct perf_event *bp)
 {
 	struct breakpoint *tmp;
+	bool ret = false;
+	spin_lock(&task_bps_lock);
 	list_for_each_entry(tmp, &task_bps, list) {
-		if (!can_co_exist(tmp, bp))
-			return true;
+		if (!can_co_exist(tmp, bp)) {
+			ret = true;
+			break;
+		}
 	}
-	return false;
+	spin_unlock(&task_bps_lock);
+	return ret;
 }
 /*
@@ -215,13 +232,18 @@ static bool all_task_bps_check(struct perf_event *bp)
 static bool same_task_bps_check(struct perf_event *bp)
 {
 	struct breakpoint *tmp;
+	bool ret = false;
+	spin_lock(&task_bps_lock);
 	list_for_each_entry(tmp, &task_bps, list) {
 		if (tmp->bp->hw.target == bp->hw.target &&
-		    !can_co_exist(tmp, bp))
-			return true;
+		    !can_co_exist(tmp, bp)) {
+			ret = true;
+			break;
+		}
 	}
-	return false;
+	spin_unlock(&task_bps_lock);
+	return ret;
 }
 static int cpu_bps_add(struct perf_event *bp)
@@ -234,6 +256,7 @@ static int cpu_bps_add(struct perf_event *bp)
 	if (IS_ERR(tmp))
 		return PTR_ERR(tmp);
+	spin_lock(&cpu_bps_lock);
 	cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu);
 	for (i = 0; i < nr_wp_slots(); i++) {
 		if (!cpu_bp[i]) {
@@ -241,6 +264,7 @@ static int cpu_bps_add(struct perf_event *bp)
 			break;
 		}
 	}
+	spin_unlock(&cpu_bps_lock);
 	return 0;
 }
@@ -249,6 +273,7 @@ static void cpu_bps_remove(struct perf_event *bp)
 	struct breakpoint **cpu_bp;
 	int i = 0;
+	spin_lock(&cpu_bps_lock);
 	cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu);
 	for (i = 0; i < nr_wp_slots(); i++) {
 		if (!cpu_bp[i])
@@ -260,19 +285,25 @@ static void cpu_bps_remove(struct perf_event *bp)
 			break;
 		}
 	}
+	spin_unlock(&cpu_bps_lock);
 }
 static bool cpu_bps_check(int cpu, struct perf_event *bp)
 {
 	struct breakpoint **cpu_bp;
+	bool ret = false;
 	int i;
+	spin_lock(&cpu_bps_lock);
 	cpu_bp = per_cpu_ptr(cpu_bps, cpu);
 	for (i = 0; i < nr_wp_slots(); i++) {
-		if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp))
-			return true;
+		if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) {
+			ret = true;
+			break;
+		}
 	}
-	return false;
+	spin_unlock(&cpu_bps_lock);
+	return ret;
 }
 static bool all_cpu_bps_check(struct perf_event *bp)
@@ -286,10 +317,6 @@ static bool all_cpu_bps_check(struct perf_event *bp)
 		return false;
 }
-/*
- * We don't use any locks to serialize accesses to cpu_bps or task_bps
- * because are already inside nr_bp_mutex.
- */
 int arch_reserve_bp_slot(struct perf_event *bp)
 {
 	int ret;
On Mon, 17 Oct 2022 at 17:08, Sasha Levin <sashal@kernel.org> wrote:
> From: Marco Elver <elver@google.com>
>
> [ Upstream commit f95e5a3d59011eec1257d0e76de1e1f8969d426f ]
>
> Internal data structures (cpu_bps, task_bps) of powerpc's hw_breakpoint
> implementation have relied on nr_bp_mutex serializing access to them.
>
> Before overhauling synchronization of kernel/events/hw_breakpoint.c,
> introduce 2 spinlocks to synchronize cpu_bps and task_bps respectively,
> thus avoiding reliance on callers synchronizing powerpc's hw_breakpoint.
>
> Reported-by: Dmitry Vyukov <dvyukov@google.com>
> Signed-off-by: Marco Elver <elver@google.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Acked-by: Dmitry Vyukov <dvyukov@google.com>
> Acked-by: Ian Rogers <irogers@google.com>
> Link: https://lore.kernel.org/r/20220829124719.675715-10-elver@google.com
> Signed-off-by: Sasha Levin <sashal@kernel.org>
Backporting this patch seems unnecessary if we're not backporting the hw_breakpoint overhaul.
Without the overhaul, nothing will break without this patch.
Thanks,
-- Marco
From: Tejun Heo <tj@kernel.org>
[ Upstream commit dc79ec1b232ad2c165d381d3dd2626df4ef9b5a4 ]
There's a seemingly harmless data-race around cgrp_dfl_visible detected by kernel concurrency sanitizer. Let's remove it by throwing WRITE/READ_ONCE at it.
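The general pattern, as a minimal sketch: mark both sides of an
intentionally lockless flag so the accesses can't be torn or fused by the
compiler, and KCSAN knows the race is intentional.

  static bool flag;

  static void writer(void) { WRITE_ONCE(flag, true); }
  static bool reader(void) { return READ_ONCE(flag); }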
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Abhishek Shah <abhishek.shah@columbia.edu>
Cc: Gabriel Ryan <gabe@cs.columbia.edu>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Link: https://lore.kernel.org/netdev/20220819072256.fn7ctciefy4fc4cu@wittgenstein/
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/cgroup/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 80c23f48f3b4..1969ba9b4090 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2153,7 +2153,7 @@ static int cgroup_get_tree(struct fs_context *fc)
 	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
 	int ret;
-	cgrp_dfl_visible = true;
+	WRITE_ONCE(cgrp_dfl_visible, true);
 	cgroup_get_live(&cgrp_dfl_root.cgrp);
 	ctx->root = &cgrp_dfl_root;
@@ -6068,7 +6068,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 	struct cgroup *cgrp;
 	int ssid, count = 0;
-		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+		if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
 			continue;
seq_printf(m, "%d:", root->hierarchy_id);
From: Robin Murphy <robin.murphy@arm.com>
[ Upstream commit c919739ce4721ecf7b96b99253b032df30fcf19b ]
Currently we rely on registering all our instances before initially allowing any .probe_device calls via bus_set_iommu(). In preparation for phasing out the latter, make sure we won't inadvertently return success for a device associated with a known but not yet registered instance, otherwise we'll run straight into iommu_group_get_for_dev() trying to use NULL ops.
That also highlights an issue with intel_iommu_get_resv_regions() taking dmar_global_lock from within a section where intel_iommu_init() already holds it, which already exists via probe_acpi_namespace_devices() when an ANDD device is probed, but gets more obvious with the upcoming change to iommu_device_register(). Since they are both read locks it manages not to deadlock in practice, and a more in-depth rework of this locking is underway, so no attempt is made to address it here.
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/579f2692291bcbfc3ac64f7456fcff0d629af131.166057278...
Signed-off-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/iommu/intel/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3ed15e8ca677..690fad0d37f8 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4586,7 +4586,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev)
 	u8 bus, devfn;
 	iommu = device_to_iommu(dev, &bus, &devfn);
-	if (!iommu)
+	if (!iommu || !iommu->iommu.ops)
 		return ERR_PTR(-ENODEV);
info = kzalloc(sizeof(*info), GFP_KERNEL);
From: Marek Bykowski <marek.bykowski@gmail.com>
[ Upstream commit d5e3050c0feb8bf7b9a75482fafcc31b90257926 ]
If the 'linux,initrd-start' and 'linux,initrd-end' properties of the chosen
node, populated by the bootloader (e.g. U-Boot), are such that start > end,
then phys_initrd_size, calculated as end - start, is negative, and is then
converted to a huge positive value because it is unsigned long long. The
memory region with that (invalid) size is added to bootmem, and paging_init()
attempts to page it in, which results in the kernel fault.
For example, on the FVP ARM64 system I'm running, U-Boot populates
'linux,initrd-start' with 8800_0000 and 'linux,initrd-end' with 0. The
phys_initrd_size calculated is then ffff_ffff_7800_0000
(= 0 - 8800_0000 = -8800_0000 + ULLONG_MAX + 1). paging_init() then attempts
to map the address 8800_0000 + ffff_ffff_7800_0000 and oops'es as below.
It should be stressed that this is generally a fault of the bootloader,
which the kernel relies on; however, we should not allow a bootloader
misconfiguration to lead to a kernel oops. Not only should the kernel be
bulletproof against it, but finding the root cause of a paging fault that
spans the bootloader, DT, and kernel is not easy either.
Unable to handle kernel paging request at virtual address fffffffefe43c000
Mem abort info:
  ESR = 0x96000007
  EC = 0x25: DABT (current EL), IL = 32 bits
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
Data abort info:
  ISV = 0, ISS = 0x00000007
  CM = 0, WnR = 0
swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000080e3d000
[fffffffefe43c000] pgd=0000000080de9003, pud=0000000080de9003
Unable to handle kernel paging request at virtual address ffffff8000de9f90
Mem abort info:
  ESR = 0x96000005
  EC = 0x25: DABT (current EL), IL = 32 bits
  SET = 0, FnV = 0
  EA = 0, S1PTW = 0
Data abort info:
  ISV = 0, ISS = 0x00000005
  CM = 0, WnR = 0
swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000080e3d000
[ffffff8000de9f90] pgd=0000000000000000, pud=0000000000000000
Internal error: Oops: 96000005 [#1] PREEMPT SMP
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 5.4.51-yocto-standard #1
Hardware name: FVP Base (DT)
pstate: 60000085 (nZCv daIf -PAN -UAO)
pc : show_pte+0x12c/0x1b4
lr : show_pte+0x100/0x1b4
sp : ffffffc010ce3b30
x29: ffffffc010ce3b30 x28: ffffffc010ceed80 x27: fffffffefe43c000
x26: fffffffefe43a028 x25: 0000000080bf0000 x24: 0000000000000025
x23: ffffffc010b8d000 x22: ffffffc010e3d000 x21: 0000000080de9000
x20: ffffff7f80000f90 x19: fffffffefe43c000 x18: 0000000000000030
x17: 0000000000001400 x16: 0000000000001c00 x15: ffffffc010cef1b8
x14: ffffffffffffffff x13: ffffffc010df1f40 x12: ffffffc010df1b70
x11: ffffffc010ce3b30 x10: ffffffc010ce3b30 x9 : 00000000ffffffc8
x8 : 0000000000000000 x7 : 000000000000000f x6 : ffffffc010df16e8
x5 : 0000000000000000 x4 : 0000000000000000 x3 : 00000000ffffffff
x2 : 0000000000000000 x1 : 0000008080000000 x0 : ffffffc010af1d68
Call trace:
 show_pte+0x12c/0x1b4
 die_kernel_fault+0x54/0x78
 __do_kernel_fault+0x11c/0x128
 do_translation_fault+0x58/0xac
 do_mem_abort+0x50/0xb0
 el1_da+0x1c/0x90
 __create_pgd_mapping+0x348/0x598
 paging_init+0x3f0/0x70d0
 setup_arch+0x2c0/0x5d4
 start_kernel+0x94/0x49c
Code: 92748eb5 900052a0 9135a000 cb010294 (f8756a96)
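The arithmetic itself is easy to demonstrate in isolation (standalone
userspace sketch, not kernel code):

  #include <stdio.h>

  int main(void)
  {
          unsigned long long start = 0x88000000ULL, end = 0;
          unsigned long long size = end - start;  /* wraps, can't go negative */

          printf("%llx\n", size);                 /* prints ffffffff78000000 */
          return 0;
  }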
Signed-off-by: Marek Bykowski <marek.bykowski@gmail.com>
Link: https://lore.kernel.org/r/20220909023358.76881-1-marek.bykowski@gmail.com
Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/of/fdt.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index 583ca847a39c..8b379ba7471b 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -937,6 +937,8 @@ static void __init early_init_dt_check_for_initrd(unsigned long node)
 	if (!prop)
 		return;
 	end = of_read_number(prop, len/4);
+	if (start > end)
+		return;
 	__early_init_dt_declare_initrd(start, end);
 	phys_initrd_start = start;
From: Peter Zijlstra <peterz@infradead.org>
[ Upstream commit 7a7621dfa417aa3715d2a3bd1bdd6cf5018274d0 ]
When 'discussing' control flow Masami mentioned the LOOP* instructions and I realized objtool doesn't decode them properly.
As it turns out, these instructions are somewhat inefficient and as such unlikely to be emitted by the compiler (a few vmlinux.o checks can't find a single one) so this isn't critical, but still, best to decode them properly.
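For reference, the control flow the LOOP family encodes, modelled in C
(illustration only): 'loop' decrements (e)cx and branches while it is
non-zero, i.e. a conditional jump, hence INSN_JUMP_CONDITIONAL.

  static void loop_model(unsigned long ecx)
  {
          do {
                  /* loop body */
          } while (--ecx != 0);   /* 'loop 1b' is this backward branch */
  }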
Reported-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/Yxhd4EMKyoFoH9y4@hirez.programming.kicks-ass.net
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 tools/objtool/arch/x86/decode.c | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index c260006106be..1c253b4b7ce0 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -635,6 +635,12 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec
 		*type = INSN_CONTEXT_SWITCH;
 		break;
+	case 0xe0: /* loopne */
+	case 0xe1: /* loope */
+	case 0xe2: /* loop */
+		*type = INSN_JUMP_CONDITIONAL;
+		break;
+
 	case 0xe8:
 		*type = INSN_CALL;
 		/*
From: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
[ Upstream commit 48dfb5d2560d36fb16c7d430c229d1604ea7d185 ]
Make the region inside rwsem_write_trylock() non-preemptible.
We observe an RT task hogging the CPU when trying to acquire an rwsem that
was taken by a kworker task, but before the rwsem owner was set.
Here is the scenario:

1. CFS task (affined to a particular CPU) takes rwsem lock.
2. CFS task gets preempted by a RT task before setting owner.
3. RT task (FIFO) is trying to acquire the lock, but spinning until RT throttling happens for the lock as the lock was taken by CFS task.
This patch attempts to fix the above issue by disabling preemption until owner is set for the lock. While at it also fix the issues at the places where rwsem_{set,clear}_owner() are called.
This also adds a lockdep annotation for preemption being disabled in
rwsem_{set,clear}_owner(), at Peter Zijlstra's suggestion.
Signed-off-by: Gokul krishna Krishnakumar <quic_gokukris@quicinc.com>
Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/1662661467-24203-1-git-send-email-quic_mojha@quici...
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/locking/rwsem.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 65f0262f635e..44873594de03 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -133,14 +133,19 @@
  * the owner value concurrently without lock. Read from owner, however,
  * may not need READ_ONCE() as long as the pointer value is only used
  * for comparison and isn't being dereferenced.
+ *
+ * Both rwsem_{set,clear}_owner() functions should be in the same
+ * preempt disable section as the atomic op that changes sem->count.
  */
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
+	lockdep_assert_preemption_disabled();
 	atomic_long_set(&sem->owner, (long)current);
 }
 static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 {
+	lockdep_assert_preemption_disabled();
 	atomic_long_set(&sem->owner, 0);
 }
@@ -251,13 +256,16 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
 static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
 {
 	long tmp = RWSEM_UNLOCKED_VALUE;
+	bool ret = false;
+	preempt_disable();
 	if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
 		rwsem_set_owner(sem);
-		return true;
+		ret = true;
 	}
-	return false;
+	preempt_enable();
+	return ret;
 }
 /*
@@ -1352,8 +1360,10 @@ static inline void __up_write(struct rw_semaphore *sem)
 	DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
 			    !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+	preempt_disable();
 	rwsem_clear_owner(sem);
 	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
+	preempt_enable();
 	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
 		rwsem_wake(sem);
 }
From: Andrew Price <anprice@redhat.com>
[ Upstream commit 670f8ce56dd0632dc29a0322e188cc73ce3c6b92 ]
Fuzzers like to scribble over sb_bsize_shift but in reality it's very unlikely that this field would be corrupted on its own. Nevertheless it should be checked to avoid the possibility of messy mount errors due to bad calculations. It's always a fixed value based on the block size so we can just check that it's the expected value.
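The invariant being enforced, as a standalone userspace sketch:

  #include <strings.h>    /* ffs(); the kernel has its own version */
  #include <assert.h>

  int main(void)
  {
          unsigned int sb_bsize = 4096;           /* a power of two: 2^12 */
          unsigned int sb_bsize_shift = 12;

          /* the check the patch adds, in isolation */
          assert(sb_bsize_shift == (unsigned int)ffs(sb_bsize) - 1);
          return 0;
  }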
Tested with:
mkfs.gfs2 -O -p lock_nolock /dev/vdb
for i in 0 -1 64 65 32 33; do
    gfs2_edit -p sb field sb_bsize_shift $i /dev/vdb
    mount /dev/vdb /mnt/test && umount /mnt/test
done
Before this patch we get a withdraw after
[   76.413681] gfs2: fsid=loop0.0: fatal: invalid metadata block
[   76.413681]   bh = 19 (type: exp=5, found=4)
[   76.413681]   function = gfs2_meta_buffer, file = fs/gfs2/meta_io.c, line = 492
and with UBSAN configured we also get complaints like
[   76.373395] UBSAN: shift-out-of-bounds in fs/gfs2/ops_fstype.c:295:19
[   76.373815] shift exponent 4294967287 is too large for 64-bit type 'long unsigned int'
After the patch, these complaints don't appear, mount fails immediately and we get an explanation in dmesg.
Reported-by: syzbot+dcf33a7aae997956fe06@syzkaller.appspotmail.com
Signed-off-by: Andrew Price <anprice@redhat.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/gfs2/ops_fstype.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ba4dfef0c15d..890d1b5fbace 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -180,7 +180,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
 		pr_warn("Invalid block size\n");
 		return -EINVAL;
 	}
-
+	if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) {
+		pr_warn("Invalid block size shift\n");
+		return -EINVAL;
+	}
 	return 0;
 }
From: Yury Norov <yury.norov@gmail.com>
[ Upstream commit 546a073d628111e3338af689938407e77d5dc38f ]
generic_secondary_common_init() calls LOAD_REG_ADDR(r7, nr_cpu_ids)
conditionally on CONFIG_SMP. However, if NR_CPUS == 1, the kernel doesn't
use nr_cpu_ids, and in C code it's just:

  #if NR_CPUS == 1
  #define nr_cpu_ids ...
This series makes the declaration of nr_cpu_ids conditional on
NR_CPUS == 1, and that reveals the issue, because the compiler can't link
the LOAD_REG_ADDR(r7, nr_cpu_ids) against the nonexistent symbol.
The current code looks unsafe for those who build the kernel with
CONFIG_SMP=y and NR_CPUS == 1. This is a weird configuration, but it is not
disallowed.
Fix the linker error by replacing LOAD_REG_ADDR() with LOAD_REG_IMMEDIATE() conditionally on NR_CPUS == 1.
As the following patch adds the CONFIG_FORCE_NR_CPUS option, which has a
similar effect on nr_cpu_ids, make generic_secondary_common_init()
conditional on it too.
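A C-level analogy of the fix (illustrative only): when nr_cpu_ids is not
emitted as a symbol, the compile-time constant must be used instead of a
load from memory.

  #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
          unsigned int max_cpus = NR_CPUS;        /* immediate constant */
  #else
          unsigned int max_cpus = nr_cpu_ids;     /* load from the variable */
  #endif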
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/kernel/head_64.S | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index d3eea633d11a..8408d3f7f61f 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -400,8 +400,12 @@ generic_secondary_common_init:
 #else
 	LOAD_REG_ADDR(r8, paca_ptrs)	/* Load paca_ptrs pointe	 */
 	ld	r8,0(r8)		/* Get base vaddr of array	 */
+#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
+	LOAD_REG_IMMEDIATE(r7, NR_CPUS)
+#else
 	LOAD_REG_ADDR(r7, nr_cpu_ids)	/* Load nr_cpu_ids address	 */
 	lwz	r7,0(r7)		/* also the max paca allocated	 */
+#endif
 	li	r5,0			/* logical cpu id		 */
1:	sldi	r9,r5,3			/* get paca_ptrs[] index from cpu id */
From: Greg Ungerer <gerg@linux-m68k.org>
[ Upstream commit 750321ace9107e103f254bf46900629ff347eb7b ]
Compiling for a classic m68k non-MMU target with no specific CPU selected fails with the following error:
arch/m68k/68000/ints.c: In function 'process_int':
arch/m68k/68000/ints.c:82:30: error: 'ISR' undeclared (first use in this function)
   82 |         unsigned long pend = ISR;
      |                              ^~~
This interrupt handling code is specific to the 68328 family of 68000
parts. There are a couple of variants (68EZ328, 68VZ328), and their common
ancestor is the straight 68328.
The code here includes a specific header for each variant type. But if none is selected then nothing is included to supply the appropriate register and bit flags defines.
Rearrange the includes so that at least one type is always included. At the very least the 68328 base type should be the fallback, so make that true.
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/m68k/68000/ints.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/m68k/68000/ints.c b/arch/m68k/68000/ints.c
index cda49b12d7be..f9a5ec781408 100644
--- a/arch/m68k/68000/ints.c
+++ b/arch/m68k/68000/ints.c
@@ -18,12 +18,12 @@
 #include <asm/io.h>
 #include <asm/machdep.h>
-#if defined(CONFIG_M68328)
-#include <asm/MC68328.h>
-#elif defined(CONFIG_M68EZ328)
+#if defined(CONFIG_M68EZ328)
 #include <asm/MC68EZ328.h>
 #elif defined(CONFIG_M68VZ328)
 #include <asm/MC68VZ328.h>
+#else
+#include <asm/MC68328.h>
 #endif
/* assembler routines */
From: Greg Ungerer <gerg@linux-m68k.org>
[ Upstream commit 18011e50c497f04a57a8e00122906f04922b30b4 ]
The family of classic 68000 parts supported when in non-mmu mode all currently use the legacy timer support. Move the selection of that config option, LEGACY_TIMER_TICK, into the core CPU configuration.
This fixes compilation if no specific CPU variant is selected, since the LEGACY_TIMER_TICK option was only selected in the specific CPU variant configurations.
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/m68k/Kconfig.cpu | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index f3aa44156969..3fdca29422e2 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -46,6 +46,7 @@ config M68000
 	select GENERIC_CSUM
 	select CPU_NO_EFFICIENT_FFS
 	select HAVE_ARCH_HASH
+	select LEGACY_TIMER_TICK
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -97,7 +98,6 @@ config M68060
 config M68328
 	bool
 	depends on !MMU
-	select LEGACY_TIMER_TICK
 	select M68000
 	help
 	  Motorola 68328 processor support.
@@ -105,7 +105,6 @@ config M68328
 config M68EZ328
 	bool
 	depends on !MMU
-	select LEGACY_TIMER_TICK
 	select M68000
 	help
 	  Motorola 68EX328 processor support.
@@ -113,7 +112,6 @@ config M68EZ328
 config M68VZ328
 	bool
 	depends on !MMU
-	select LEGACY_TIMER_TICK
 	select M68000
 	help
 	  Motorola 68VZ328 processor support.
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
[ Upstream commit d7c6ea024c08bbdb799768f51ffd9fdd6236d190 ]
It is useful to be able to recheck dtbs files against a limited set of DT
schema files. This can be accomplished by using different DT_SCHEMA_FILES
argument values while rerunning make dtbs_check. However, for some reason
if_changed_rule doesn't pick up the rule_dtc changes (and doesn't retrigger
the build).
Fix this by changing if_changed_rule to if_changed_dep and squashing DTC and dt-validate into a single new command. Then if_changed_dep triggers on DT_SCHEMA_FILES changes and reruns the build/check.
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20220915114422.79378-1-dmitry.baryshkov@linaro.org
Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 scripts/Makefile.lib | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 3fb6a99e78c4..cec0560f6ac6 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -371,17 +371,15 @@ DT_CHECKER_FLAGS ?= $(if $(DT_SCHEMA_FILES),-l $(DT_SCHEMA_FILES),-m)
 DT_BINDING_DIR := Documentation/devicetree/bindings
 DT_TMP_SCHEMA := $(objtree)/$(DT_BINDING_DIR)/processed-schema.json
-quiet_cmd_dtb_check =	CHECK   $@
-      cmd_dtb_check =	$(DT_CHECKER) $(DT_CHECKER_FLAGS) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) $@ || true
+quiet_cmd_dtb =	DTC_CHK $@
+      cmd_dtb =	$(cmd_dtc) ; $(DT_CHECKER) $(DT_CHECKER_FLAGS) -u $(srctree)/$(DT_BINDING_DIR) -p $(DT_TMP_SCHEMA) $@ || true
+else
+quiet_cmd_dtb = $(quiet_cmd_dtc)
+      cmd_dtb = $(cmd_dtc)
 endif
-define rule_dtc
-	$(call cmd_and_fixdep,dtc)
-	$(call cmd,dtb_check)
-endef
-
 $(obj)/%.dtb: $(src)/%.dts $(DTC) $(DT_TMP_SCHEMA) FORCE
-	$(call if_changed_rule,dtc)
+	$(call if_changed_dep,dtb)
 $(obj)/%.dtbo: $(src)/%.dts $(DTC) FORCE
 	$(call if_changed_dep,dtc)
From: Beau Belgrave <beaub@linux.microsoft.com>
[ Upstream commit 95f187603dbff69bef19b2ab3bb54d2c060f556d ]
import_single_range() expects the direction/rw argument to describe where
the data comes from, not the protection/limit. Since the import is in a
write path, use WRITE.
Link: https://lkml.kernel.org/r/20220728233309.1896-3-beaub@linux.microsoft.com
Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@...
Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/trace/trace_events_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 706e1686b5eb..4c9f6c776618 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1245,7 +1245,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 	if (unlikely(*ppos != 0))
 		return -EFAULT;
-	if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+	if (unlikely(import_single_range(WRITE, (char __user *)ubuf,
+					 count, &iov, &i)))
 		return -EFAULT;
return user_events_write_core(file, &i);
From: Beau Belgrave <beaub@linux.microsoft.com>
[ Upstream commit e6f89a149872ab0e03cfded97983df74dfb0ef21 ]
User processes can provide bad strings that may cause issues or leak kernel details back out. Don't trust the content of these strings when formatting strings for matching.
This also moves to a consistent dynamic length string creation model.
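The "dynamic length" model referred to here is the usual two-pass
snprintf() pattern; a minimal sketch, with build_string() as a hypothetical
stand-in for the patch's *_set_string() helpers:

  /* Pass 1: len == 0, nothing is written, the return value is the exact
   * buffer size needed. Pass 2: fill a buffer of exactly that size.
   * No fixed-size guess, no truncation. */
  int len = build_string(NULL, 0, args);
  char *buf = kmalloc(len, GFP_KERNEL);

  if (buf)
          build_string(buf, len, args);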
Link: https://lkml.kernel.org/r/20220728233309.1896-4-beaub@linux.microsoft.com
Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@...
Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/trace/trace_events_user.c | 91 +++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 32 deletions(-)
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 4c9f6c776618..29e16e1e5f11 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -45,7 +45,6 @@
 #define MAX_EVENT_DESC 512
 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
-#define MAX_FIELD_ARG_NAME 256
static char *register_page_data;
@@ -483,6 +482,48 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func)
 }
 #define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
+				     char *buf, int len, bool *colon)
+{
+	int pos = 0, i = *iout;
+
+	*colon = false;
+
+	for (; i < argc; ++i) {
+		if (i != *iout)
+			pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
+
+		if (strchr(argv[i], ';')) {
+			++i;
+			*colon = true;
+			break;
+		}
+	}
+
+	/* Actual set, advance i */
+	if (len != 0)
+		*iout = i;
+
+	return pos + 1;
+}
+
+static int user_field_set_string(struct ftrace_event_field *field,
+				 char *buf, int len, bool colon)
+{
+	int pos = 0;
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
+	pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
+
+	if (colon)
+		pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
+
+	return pos + 1;
+}
+
 static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
 {
 	struct ftrace_event_field *field, *next;
@@ -926,49 +967,35 @@ static int user_event_free(struct dyn_event *ev)
 static bool user_field_match(struct ftrace_event_field *field, int argc,
 			     const char **argv, int *iout)
 {
-	char *field_name, *arg_name;
-	int len, pos, i = *iout;
+	char *field_name = NULL, *dyn_field_name = NULL;
 	bool colon = false, match = false;
+	int dyn_len, len;
-	if (i >= argc)
+	if (*iout >= argc)
 		return false;
-	len = MAX_FIELD_ARG_NAME;
-	field_name = kmalloc(len, GFP_KERNEL);
-	arg_name = kmalloc(len, GFP_KERNEL);
+	dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+					    0, &colon);
-	if (!arg_name || !field_name)
-		goto out;
-
-	pos = 0;
-
-	for (; i < argc; ++i) {
-		if (i != *iout)
-			pos += snprintf(arg_name + pos, len - pos, " ");
+	len = user_field_set_string(field, field_name, 0, colon);
-		pos += snprintf(arg_name + pos, len - pos, argv[i]);
-
-		if (strchr(argv[i], ';')) {
-			++i;
-			colon = true;
-			break;
-		}
-	}
+	if (dyn_len != len)
+		return false;
-	pos = 0;
+	dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
+	field_name = kmalloc(len, GFP_KERNEL);
-	pos += snprintf(field_name + pos, len - pos, field->type);
-	pos += snprintf(field_name + pos, len - pos, " ");
-	pos += snprintf(field_name + pos, len - pos, field->name);
+	if (!dyn_field_name || !field_name)
+		goto out;
-	if (colon)
-		pos += snprintf(field_name + pos, len - pos, ";");
+	user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+				  dyn_len, &colon);
-	*iout = i;
+	user_field_set_string(field, field_name, len, colon);
-	match = strcmp(arg_name, field_name) == 0;
+	match = strcmp(dyn_field_name, field_name) == 0;
 out:
-	kfree(arg_name);
+	kfree(dyn_field_name);
 	kfree(field_name);
return match;
From: Robin Murphy <robin.murphy@arm.com>
[ Upstream commit f1ad5338a4d57fe1fe6475003acb8c70bf9d1bdf ]
Commit 951d48855d86 ("of: Make of_dma_get_range() work on bus nodes") relaxed the handling of "dma-ranges" for any leaf node on the assumption that it would still represent a usage error for the property to be present on a non-bus leaf node. However there turns out to be a fiddly case where a bus also represents a DMA-capable device in its own right, such as a PCIe root complex with an integrated DMA engine on its platform side. In such cases, "dma-ranges" translation is entirely valid for devices discovered behind the bus, but should not be erroneously applied to the bus controller device itself which operates in its parent's address space. Fix this by restoring the previous behaviour for the specific case where a device is configured via its own OF node, since it is logical to assume that a device should never represent its own parent bus.
Reported-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/112e8f3d3e7c054ecf5e12b5ac0aa5596ec00681.166445543...
Signed-off-by: Rob Herring <robh@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/of/address.c    | 4 +++-
 drivers/of/device.c     | 9 ++++++++-
 drivers/of/of_private.h | 5 +++++
 3 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/drivers/of/address.c b/drivers/of/address.c
index 94f017d808c4..d6f5a427a329 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -579,7 +579,8 @@ u64 of_translate_address(struct device_node *dev, const __be32 *in_addr)
 }
 EXPORT_SYMBOL(of_translate_address);
-static struct device_node *__of_get_dma_parent(const struct device_node *np)
+#ifdef CONFIG_HAS_DMA
+struct device_node *__of_get_dma_parent(const struct device_node *np)
 {
 	struct of_phandle_args args;
 	int ret, index;
@@ -596,6 +597,7 @@ static struct device_node *__of_get_dma_parent(const struct device_node *np)
 	return of_node_get(args.np);
 }
+#endif
 static struct device_node *of_get_next_dma_parent(struct device_node *np)
 {
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 75b6cbffa755..8cefe5a7d04e 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -116,12 +116,19 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 {
 	const struct iommu_ops *iommu;
 	const struct bus_dma_region *map = NULL;
+	struct device_node *bus_np;
 	u64 dma_start = 0;
 	u64 mask, end, size = 0;
 	bool coherent;
 	int ret;
-	ret = of_dma_get_range(np, &map);
+	if (np == dev->of_node)
+		bus_np = __of_get_dma_parent(np);
+	else
+		bus_np = of_node_get(np);
+
+	ret = of_dma_get_range(bus_np, &map);
+	of_node_put(bus_np);
 	if (ret < 0) {
 		/*
 		 * For legacy reasons, we have to assume some devices need
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index 9324483397f6..fb6792d381a6 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -155,12 +155,17 @@ struct bus_dma_region;
 #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA)
 int of_dma_get_range(struct device_node *np,
 		const struct bus_dma_region **map);
+struct device_node *__of_get_dma_parent(const struct device_node *np);
 #else
 static inline int of_dma_get_range(struct device_node *np,
 		const struct bus_dma_region **map)
 {
 	return -ENODEV;
 }
+static inline struct device_node *__of_get_dma_parent(const struct device_node *np)
+{
+	return of_get_parent(np);
+}
 #endif
void fdt_init_reserved_mem(void);
From: Zhao Liu <zhao1.liu@intel.com>
[ Upstream commit 154fb14df7a3c81dea82eca7c0c46590f5ffc3d2 ]
kmap() is being deprecated in favor of kmap_local_page()[1].
There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps and it might block when the mapping space is fully utilized until a slot becomes available.
With kmap_local_page() the mappings are per thread, CPU local, can take page faults, and can be called from any context (including interrupts). It is faster than kmap() in kernels with HIGHMEM enabled. Furthermore, the tasks can be preempted and, when they are scheduled to run again, the kernel virtual addresses are restored and are still valid.
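The conversion itself is mechanical (sketch only). Note the asymmetry:
kunmap() took the page, while kunmap_local() takes the address that
kmap_local_page() returned.

  void *dst = kmap_local_page(page);

  memcpy(dst, src, PAGE_SIZE);
  kunmap_local(dst);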
In the function hyperv_init() of hyperv/hv_init.c, the mapping is used in a
single thread and is short-lived. So, in this case, it's safe to simply use
kmap_local_page() to create the mapping, and this avoids the wasted cost of
kmap()'s global synchronization.
In addition, hyperv_init() checks whether kmap() fails with BUG_ON(). From
the original discussion[2], the BUG_ON() here is just used to explicitly
panic on a NULL pointer. So still keep the BUG_ON() in place to check
whether kmap_local_page() fails. Based on this consideration,
memcpy_to_page() is not selected here; only kmap_local_page() is used.
Therefore, replace kmap() with kmap_local_page() in hyperv/hv_init.c.
[1]: https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com
[2]: https://lore.kernel.org/lkml/20200915103710.cqmdvzh5lys4wsqo@liuwe-devbox-de...
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Suggested-by: Ira Weiny <ira.weiny@intel.com>
Suggested-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Signed-off-by: Zhao Liu <zhao1.liu@intel.com>
Link: https://lore.kernel.org/r/20220928095640.626350-1-zhao1.liu@linux.intel.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/x86/hyperv/hv_init.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 3de6d8b53367..72fe46eb183f 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -459,13 +459,13 @@ void __init hyperv_init(void)
 		wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
 		pg = vmalloc_to_page(hv_hypercall_pg);
-		dst = kmap(pg);
+		dst = kmap_local_page(pg);
 		src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT,
 			       PAGE_SIZE, MEMREMAP_WB);
 		BUG_ON(!(src && dst));
 		memcpy(dst, src, HV_HYP_PAGE_SIZE);
 		memunmap(src);
-		kunmap(pg);
+		kunmap_local(dst);
 	} else {
 		hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
 		wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
From: Alexander Potapenko <glider@google.com>
[ Upstream commit 79dbd006a6d6f51777ba4948046561b6d9270504 ]
EFI stub cannot be linked with KMSAN runtime, so we disable instrumentation for it.
Instrumenting kcov, stackdepot or lockdep leads to infinite recursion caused by instrumentation hooks calling instrumented code again.
Link: https://lkml.kernel.org/r/20220915150417.722975-13-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/firmware/efi/libstub/Makefile | 1 +
 kernel/Makefile                       | 1 +
 kernel/locking/Makefile               | 3 ++-
 lib/Makefile                          | 3 +++
 4 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 2c67f71f2375..2c1eb1fb0f22 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -53,6 +53,7 @@ GCOV_PROFILE			:= n
 # Sanitizer runtimes are unavailable and cannot be linked here.
 KASAN_SANITIZE			:= n
 KCSAN_SANITIZE			:= n
+KMSAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
diff --git a/kernel/Makefile b/kernel/Makefile
index a7e1f49ab2b3..e47f0526c987 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n
 KASAN_SANITIZE_kcov.o := n
 KCSAN_SANITIZE_kcov.o := n
 UBSAN_SANITIZE_kcov.o := n
+KMSAN_SANITIZE_kcov.o := n
 CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
 # Don't instrument error handlers
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index d51cabf28f38..ea925731fa40 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,8 +5,9 @@ KCOV_INSTRUMENT		:= n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
 KCSAN_SANITIZE_lockdep.o := n
+KMSAN_SANITIZE_lockdep.o := n
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/lib/Makefile b/lib/Makefile
index f99bf61f8bbc..73fea85b7636 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -272,6 +272,9 @@ obj-$(CONFIG_POLYNOMIAL) += polynomial.o
 CFLAGS_stackdepot.o += -fno-builtin
 obj-$(CONFIG_STACKDEPOT) += stackdepot.o
 KASAN_SANITIZE_stackdepot.o := n
+# In particular, instrumenting stackdepot.c with KMSAN will result in infinite
+# recursion.
+KMSAN_SANITIZE_stackdepot.o := n
 KCOV_INSTRUMENT_stackdepot.o := n
obj-$(CONFIG_REF_TRACKER) += ref_tracker.o
From: Alexander Potapenko <glider@google.com>
[ Upstream commit f630a5d0ca59a6e73b61e3f82c371dc230da99ff ]
KMSAN metadata for adjacent physical pages may not be adjacent, therefore accessing such pages together may lead to metadata corruption. We disable merging pages in biovec to prevent such corruptions.
Link: https://lkml.kernel.org/r/20220915150417.722975-28-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Eric Biggers <ebiggers@google.com>
Cc: Eric Biggers <ebiggers@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Ilya Leoshkevich <iii@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Marco Elver <elver@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 block/blk.h | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/block/blk.h b/block/blk.h
index 0d6668663ab5..37d24fab1e30 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -93,6 +93,13 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
        phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
        phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;

+       /*
+        * Merging adjacent physical pages may not work correctly under KMSAN
+        * if their metadata pages aren't adjacent. Just disable merging.
+        */
+       if (IS_ENABLED(CONFIG_KMSAN))
+               return false;
+
        if (addr1 + vec1->bv_len != addr2)
                return false;
        if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page))
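A minimal userspace model of the condition being short-circuited may help; the function below is illustrative only, not the kernel helper. Two byte ranges are physically contiguous iff the first ends exactly where the second begins, and under KMSAN the test is simply forced to fail, since contiguous data pages do not guarantee contiguous metadata pages.

#include <stdbool.h>
#include <stdio.h>

/* Userspace model of the merge test in biovec_phys_mergeable(): two
 * ranges are contiguous iff the first ends where the second begins.
 * With KMSAN the test is forced to fail. */
static bool ranges_mergeable(unsigned long addr1, unsigned long len1,
                             unsigned long addr2, bool kmsan)
{
        if (kmsan)
                return false;
        return addr1 + len1 == addr2;
}

int main(void)
{
        printf("%d\n", ranges_mergeable(0x1000, 0x1000, 0x2000, false)); /* 1 */
        printf("%d\n", ranges_mergeable(0x1000, 0x1000, 0x2000, true));  /* 0 */
        return 0;
}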
From: Shuqi Zhang zhangshuqi3@huawei.com
[ Upstream commit 9b7eadd9bd3a0cc24533a23d83c46430a0ea60ff ]
The following BUG_ON is hit when running xfstests generic/503:

WARNING: CPU: 21 PID: 1385 at fs/f2fs/inode.c:762 f2fs_evict_inode+0x847/0xaa0
Modules linked in:
CPU: 21 PID: 1385 Comm: umount Not tainted 5.19.0-rc5+ #73
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-4.fc34 04/01/2014
Call Trace:
 evict+0x129/0x2d0
 dispose_list+0x4f/0xb0
 evict_inodes+0x204/0x230
 generic_shutdown_super+0x5b/0x1e0
 kill_block_super+0x29/0x80
 kill_f2fs_super+0xe6/0x140
 deactivate_locked_super+0x44/0xc0
 deactivate_super+0x79/0x90
 cleanup_mnt+0x114/0x1a0
 __cleanup_mnt+0x16/0x20
 task_work_run+0x98/0x100
 exit_to_user_mode_prepare+0x3d0/0x3e0
 syscall_exit_to_user_mode+0x12/0x30
 do_syscall_64+0x42/0x80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0
Function flow analysis when BUG occurs:

f2fs_fallocate                    mmap
                                  do_page_fault
                                    pte_spinlock            // ---lock_pte
                                    do_wp_page
                                    wp_page_shared
                                      pte_unmap_unlock      // unlock_pte
                                      do_page_mkwrite
                                      f2fs_vm_page_mkwrite
                                        down_read(invalidate_lock)
                                        lock_page
                                        if (PageMappedToDisk(page))
                                                goto out;
                                        // set_page_dirty   --NOT RUN
                                        out: up_read(invalidate_lock);
                                      finish_mkwrite_fault  // unlock_pte
f2fs_collapse_range
  down_write(i_mmap_sem)
  truncate_pagecache
    unmap_mapping_pages
      i_mmap_lock_write           // down_write(i_mmap_rwsem)
      ......
      zap_pte_range
        pte_offset_map_lock       // ---lock_pte
        set_page_dirty
          f2fs_dirty_data_folio
            if (!folio_test_dirty(folio)) {
                                  fault_dirty_shared_page
                                    set_page_dirty
                                      f2fs_dirty_data_folio
                                        if (!folio_test_dirty(folio)) {
                                          filemap_dirty_folio
                                          f2fs_update_dirty_folio // ++
                                        }
                                    unlock_page
              filemap_dirty_folio
              f2fs_update_dirty_folio   // page count++
            }
        pte_unmap_unlock          // --unlock_pte
      i_mmap_unlock_write         // up_write(i_mmap_rwsem)
    truncate_inode_pages
  up_write(i_mmap_sem)
When the race happens between mmap-do_page_fault-wp_page_shared and fallocate-truncate_pagecache-zap_pte_range, zap_pte_range calls set_page_dirty without holding the page lock. Besides, although truncate_pagecache takes the i_mmap and pte locks, wp_page_shared calls fault_dirty_shared_page without either. In this case, two threads race in f2fs_dirty_data_folio(): the page is set dirty only ONCE, but the count is added TWICE by calling filemap_dirty_folio(). Thus the dirty page count no longer matches the number of actually dirty pages.

The solution below handles the case where the race happens without any lock held: since folio_test_set_dirty() inside filemap_dirty_folio() is atomic, judging by its return value is not subject to the race.
Signed-off-by: Shuqi Zhang zhangshuqi3@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/checkpoint.c | 3 +-- fs/f2fs/data.c | 3 +-- fs/f2fs/node.c | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6d8b2bf14de0..01d3eede0b29 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -448,8 +448,7 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping,

        if (!folio_test_uptodate(folio))
                folio_mark_uptodate(folio);
-       if (!folio_test_dirty(folio)) {
-               filemap_dirty_folio(mapping, folio);
+       if (filemap_dirty_folio(mapping, folio)) {
                inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
                set_page_private_reference(&folio->page);
                return true;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f2a272613477..9157d09dde6d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3681,8 +3681,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
                folio_mark_uptodate(folio);
        BUG_ON(folio_test_swapcache(folio));

-       if (!folio_test_dirty(folio)) {
-               filemap_dirty_folio(mapping, folio);
+       if (filemap_dirty_folio(mapping, folio)) {
                f2fs_update_dirty_folio(inode, folio);
                return true;
        }
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 02e92a72511b..3f508b13cb61 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2151,8 +2151,7 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
        if (IS_INODE(&folio->page))
                f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
 #endif
-       if (!folio_test_dirty(folio)) {
-               filemap_dirty_folio(mapping, folio);
+       if (filemap_dirty_folio(mapping, folio)) {
                inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
                set_page_private_reference(&folio->page);
                return true;
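The property the fix relies on can be sketched in plain C11 atomics (a userspace analogue, not the kernel folio API): an atomic test-and-set reports the clean-to-dirty transition to exactly one caller, so the dirty counter is bumped exactly once no matter how the two paths race.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool dirty;
static int dirty_count;

/* Like filemap_dirty_folio() after the patch: returns true only for
 * the caller whose test-and-set actually set the dirty bit. */
static bool mark_dirty(void)
{
        return !atomic_exchange(&dirty, true);
}

int main(void)
{
        if (mark_dirty())
                dirty_count++;  /* first caller wins and counts */
        if (mark_dirty())
                dirty_count++;  /* racing second caller does not */
        printf("dirty_count = %d\n", dirty_count);      /* prints 1 */
        return 0;
}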
From: Zhang Qilong zhangqilong3@huawei.com
[ Upstream commit 544b53dadc208278fd0796f2c22ea24a3fe16564 ]
ERROR: code indent should use tabs where possible
ERROR: spaces required around that ':'
ERROR: incorrect tab
Found several coding style errors while reviewing the code and fixed them. There is no functional change.
Signed-off-by: Zhang Qilong zhangqilong3@huawei.com Reviewed-by: Chao Yu chao@kernel.org Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/data.c | 2 +- fs/f2fs/debug.c | 2 +- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/node.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9157d09dde6d..cb8260b1cb4e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -707,7 +707,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
                wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);

        inc_page_count(fio->sbi, is_read_io(fio->op) ?
-                       __read_io_type(page): WB_DATA_TYPE(fio->page));
+                       __read_io_type(page) : WB_DATA_TYPE(fio->page));

        __submit_bio(fio->sbi, bio, fio->type);
        return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index c92625ef16d0..9bac409d31bd 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -347,7 +347,7 @@ static int stat_show(struct seq_file *s, void *v)

                seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
                        si->sbi->sb->s_bdev, i++,
-                       f2fs_readonly(si->sbi->sb) ? "RO": "RW",
+                       f2fs_readonly(si->sbi->sb) ? "RO" : "RW",
                        is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ?
                        "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good"));
                if (si->sbi->s_flag) {
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 866e72b29bd5..d6b4a1c8ed2f 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -583,7 +583,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
                        org_end = dei.fofs + dei.len;
                        f2fs_bug_on(sbi, pos >= org_end);

-                       if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+                       if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
                                en->ei.len = pos - en->ei.fofs;
                                prev_en = en;
                                parts = 1;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ecd833ba35fc..36e2d0226be4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4627,7 +4627,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 skip_write_trace:
        /* Do the actual write. */
        ret = dio ?
-               f2fs_dio_write_iter(iocb, from, &may_need_sync):
+               f2fs_dio_write_iter(iocb, from, &may_need_sync) :
                f2fs_buffered_write_iter(iocb, from);

        if (trace_f2fs_datawrite_end_enabled())
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3f508b13cb61..c9b10113fcb1 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -585,7 +585,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
                ne = nat_in_journal(journal, i);
                node_info_from_raw_nat(ni, &ne);
        }
-       up_read(&curseg->journal_rwsem);
+       up_read(&curseg->journal_rwsem);
        if (i >= 0) {
                f2fs_up_read(&nm_i->nat_tree_lock);
                goto cache;
From: Chao Yu chao@kernel.org
[ Upstream commit fcc2d8cc96b2f6141bbbe5b1e8953db990794b44 ]
In a fuzzed image, the ino of a dirent or orphan inode may occasionally be corrupted so that it equals a meta ino (meta_ino, node_ino or compress_ino). Callers of f2fs_iget() on the call paths below would then get a meta inode directly, which is not allowed; add a sanity check to detect such cases.
case #1
- recover_dentry
 - __f2fs_find_entry
  - f2fs_iget_retry

case #2
- recover_orphan_inode
 - f2fs_iget_retry
Signed-off-by: Chao Yu chao@kernel.org Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/inode.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index fc55f5bd1fcc..e27968661bd7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -480,6 +480,12 @@ static int do_read_inode(struct inode *inode)
        return 0;
 }

+static bool is_meta_ino(struct f2fs_sb_info *sbi, unsigned int ino)
+{
+       return ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi) ||
+               ino == F2FS_COMPRESS_INO(sbi);
+}
+
 struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -491,16 +497,21 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
                return ERR_PTR(-ENOMEM);

        if (!(inode->i_state & I_NEW)) {
+               if (is_meta_ino(sbi, ino)) {
+                       f2fs_err(sbi, "inaccessible inode: %lu, run fsck to repair", ino);
+                       set_sbi_flag(sbi, SBI_NEED_FSCK);
+                       ret = -EFSCORRUPTED;
+                       trace_f2fs_iget_exit(inode, ret);
+                       iput(inode);
+                       return ERR_PTR(ret);
+               }
+
                trace_f2fs_iget(inode);
                return inode;
        }
-       if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
-               goto make_now;

-#ifdef CONFIG_F2FS_FS_COMPRESSION
-       if (ino == F2FS_COMPRESS_INO(sbi))
+       if (is_meta_ino(sbi, ino))
                goto make_now;
-#endif

        ret = do_read_inode(inode);
        if (ret)
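A userspace sketch of the added check, with illustrative constants rather than the real f2fs reserved inode numbers:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: an ino read from a (possibly fuzzed) dirent or
 * orphan entry must never name one of the reserved meta inodes. */
enum { NODE_INO = 1, META_INO = 2, COMPRESS_INO = 3 };

static bool is_meta_ino(unsigned long ino)
{
        return ino == NODE_INO || ino == META_INO || ino == COMPRESS_INO;
}

int main(void)
{
        unsigned long ino = 2;  /* pretend this came from a corrupted dirent */

        if (is_meta_ino(ino))
                puts("inaccessible inode: flag EFSCORRUPTED and ask for fsck");
        return 0;
}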
From: Dominique Martinet asmadeus@codewreck.org
[ Upstream commit 52f1c45dde9136f964d63a77d19826c8a74e2c7f ]
syzbot reported a double lock here. We no longer need this lock after requests have been moved off to a local list, so just drop the lock earlier.
Link: https://lkml.kernel.org/r/20220904064028.1305220-1-asmadeus@codewreck.org Reported-by: syzbot+50f7e8d06c3768dd97f3@syzkaller.appspotmail.com Signed-off-by: Dominique Martinet asmadeus@codewreck.org Tested-by: Schspa Shi schspa@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- net/9p/trans_fd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index e758978b44be..60fcc6b30b46 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -205,6 +205,8 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
                list_move(&req->req_list, &cancel_list);
        }

+       spin_unlock(&m->client->lock);
+
        list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
                p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
                list_del(&req->req_list);
@@ -212,7 +214,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
                req->t_err = err;
                p9_client_cb(m->client, req, REQ_STATUS_ERROR);
        }
-       spin_unlock(&m->client->lock);
 }

 static __poll_t
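The shape of the fix is the classic "drain under lock, call back outside it" pattern; here is a minimal pthread sketch (types and names are illustrative, not the 9p structures):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
        struct req *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct req *pending;

static void cancel_all(void (*cb)(struct req *))
{
        struct req *local;

        pthread_mutex_lock(&lock);
        local = pending;                /* steal the whole list ... */
        pending = NULL;
        pthread_mutex_unlock(&lock);    /* ... and drop the lock first */

        while (local) {                 /* callbacks run without the lock, */
                struct req *next = local->next;
                cb(local);              /* so one that re-takes it cannot deadlock */
                local = next;
        }
}

static void free_req(struct req *r)
{
        free(r);
}

int main(void)
{
        pending = calloc(1, sizeof(*pending));
        cancel_all(free_req);
        puts("cancelled without holding the lock during callbacks");
        return 0;
}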
From: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp
[ Upstream commit ef575281b21e9a34dfae544a187c6aac2ae424a9 ]
syzbot is reporting a hung task at p9_fd_close() [1]: p9_mux_poll_stop(), called from p9_conn_destroy() from p9_fd_close(), is failing to interrupt already started kernel_read() (from p9_fd_read() from p9_read_work()) and/or kernel_write() (from p9_fd_write() from p9_write_work()) requests.
Since p9_socket_open() sets the O_NONBLOCK flag, p9_mux_poll_stop() does not need to interrupt kernel_read()/kernel_write() on sockets. However, p9_fd_open() does not set O_NONBLOCK, and a pipe blocks unless a signal is pending, so p9_mux_poll_stop() does need to interrupt kernel_read()/kernel_write() when the file descriptor refers to a pipe. In other words, a pipe file descriptor needs to be handled as if it were a socket file descriptor.
We somehow need to interrupt kernel_read()/kernel_write() on pipes.
A minimal change, which this patch makes, is to set the O_NONBLOCK flag from p9_fd_open(), since O_NONBLOCK does not affect reading/writing of regular files. But this approach changes the O_NONBLOCK flag on userspace-supplied file descriptors (which might break userspace programs), and the flag could be changed back by userspace. It would be possible to set O_NONBLOCK every time p9_fd_read()/p9_fd_write() is invoked, but a small race window for clearing the flag would still remain.

If we don't want to manipulate the O_NONBLOCK flag, we might be able to surround kernel_read()/kernel_write() with set_thread_flag(TIF_SIGPENDING) and recalc_sigpending(). But since the p9_read_work()/p9_write_work() work items are processed by kernel threads serving the global system_wq workqueue, signals cannot be delivered from remote threads when p9_mux_poll_stop() from p9_conn_destroy() from p9_fd_close() is called. Therefore, calling set_thread_flag(TIF_SIGPENDING)/recalc_sigpending() every time would be needed if we counted on signals to make kernel_read()/kernel_write() non-blocking.
Link: https://lkml.kernel.org/r/345de429-a88b-7097-d177-adecf9fed342@I-love.SAKURA... Link: https://syzkaller.appspot.com/bug?extid=8b41a1365f1106fd0f33 [1] Reported-by: syzbot syzbot+8b41a1365f1106fd0f33@syzkaller.appspotmail.com Signed-off-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Tested-by: syzbot syzbot+8b41a1365f1106fd0f33@syzkaller.appspotmail.com Reviewed-by: Christian Schoenebeck linux_oss@crudebyte.com [Dominique: add comment at Christian's suggestion] Signed-off-by: Dominique Martinet asmadeus@codewreck.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/9p/trans_fd.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 60fcc6b30b46..90f8642a7cf3 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -822,11 +822,14 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
                goto out_free_ts;
        if (!(ts->rd->f_mode & FMODE_READ))
                goto out_put_rd;
+       /* prevent workers from hanging on IO when fd is a pipe */
+       ts->rd->f_flags |= O_NONBLOCK;
        ts->wr = fget(wfd);
        if (!ts->wr)
                goto out_put_rd;
        if (!(ts->wr->f_mode & FMODE_WRITE))
                goto out_put_wr;
+       ts->wr->f_flags |= O_NONBLOCK;

        client->trans = ts;
        client->status = Connected;
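The effect of the added f_flags manipulation can be reproduced from userspace; the sketch below uses fcntl() on an ordinary pipe:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fds[2];
        char c;

        if (pipe(fds))
                return 1;
        /* Same effect as the patch's ts->rd->f_flags |= O_NONBLOCK,
         * done here through the userspace fcntl() interface. */
        if (fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_NONBLOCK))
                return 1;
        if (read(fds[0], &c, 1) < 0 && errno == EAGAIN)
                puts("empty pipe: read returned EAGAIN instead of blocking");
        return 0;
}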
From: Dominique Martinet asmadeus@codewreck.org
[ Upstream commit 296ab4a813841ba1d5f40b03190fd1bd8f25aab0 ]
Shamelessly copying the explanation from Tetsuo Handa's suggested patch [1] (slightly reworded): syzbot is reporting inconsistent lock state in p9_req_put() [2], because p9_tag_remove(), called from p9_req_put() in IRQ context, uses spin_lock_irqsave() on "struct p9_client"->lock, while trans_fd (not in IRQ context) uses plain spin_lock().

Since the locks actually protect different things in client.c and in trans_fd.c, just replace trans_fd.c's lock with a new one specific to the transport: client.c's lock protects the idr for fid/tag allocations, while trans_fd.c's protects its own req list and the request status field that acts as the transport's state machine.
Link: https://lore.kernel.org/r/20220904112928.1308799-1-asmadeus@codewreck.org Link: https://lkml.kernel.org/r/2470e028-9b05-2013-7198-1fdad071d999@I-love.SAKURA... [1] Link: https://syzkaller.appspot.com/bug?extid=2f20b523930c32c160cc [2] Reported-by: syzbot syzbot+2f20b523930c32c160cc@syzkaller.appspotmail.com Reported-by: Tetsuo Handa penguin-kernel@I-love.SAKURA.ne.jp Reviewed-by: Christian Schoenebeck linux_oss@crudebyte.com Signed-off-by: Dominique Martinet asmadeus@codewreck.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/9p/trans_fd.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-)
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 90f8642a7cf3..0191f22d1ec3 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -91,6 +91,7 @@ struct p9_poll_wait {
  * @mux_list: list link for mux to manage multiple connections (?)
  * @client: reference to client instance for this connection
  * @err: error state
+ * @req_lock: lock protecting req_list and requests statuses
  * @req_list: accounting for requests which have been sent
  * @unsent_req_list: accounting for requests that haven't been sent
  * @rreq: read request
@@ -114,6 +115,7 @@ struct p9_conn {
        struct list_head mux_list;
        struct p9_client *client;
        int err;
+       spinlock_t req_lock;
        struct list_head req_list;
        struct list_head unsent_req_list;
        struct p9_req_t *rreq;
@@ -189,10 +191,10 @@ static void p9_conn_cancel(struct p9_conn *m, int err)

        p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);

-       spin_lock(&m->client->lock);
+       spin_lock(&m->req_lock);

        if (m->err) {
-               spin_unlock(&m->client->lock);
+               spin_unlock(&m->req_lock);
                return;
        }

@@ -205,7 +207,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
                list_move(&req->req_list, &cancel_list);
        }

-       spin_unlock(&m->client->lock);
+       spin_unlock(&m->req_lock);

        list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
                p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
@@ -360,7 +362,7 @@ static void p9_read_work(struct work_struct *work)
        if ((m->rreq) && (m->rc.offset == m->rc.capacity)) {
                p9_debug(P9_DEBUG_TRANS, "got new packet\n");
                m->rreq->rc.size = m->rc.offset;
-               spin_lock(&m->client->lock);
+               spin_lock(&m->req_lock);
                if (m->rreq->status == REQ_STATUS_SENT) {
                        list_del(&m->rreq->req_list);
                        p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD);
@@ -369,14 +371,14 @@ static void p9_read_work(struct work_struct *work)
                        p9_debug(P9_DEBUG_TRANS,
                                 "Ignore replies associated with a cancelled request\n");
                } else {
-                       spin_unlock(&m->client->lock);
+                       spin_unlock(&m->req_lock);
                        p9_debug(P9_DEBUG_ERROR,
                                 "Request tag %d errored out while we were reading the reply\n",
                                 m->rc.tag);
                        err = -EIO;
                        goto error;
                }
-               spin_unlock(&m->client->lock);
+               spin_unlock(&m->req_lock);
                m->rc.sdata = NULL;
                m->rc.offset = 0;
                m->rc.capacity = 0;
@@ -454,10 +456,10 @@ static void p9_write_work(struct work_struct *work)
        }

        if (!m->wsize) {
-               spin_lock(&m->client->lock);
+               spin_lock(&m->req_lock);
                if (list_empty(&m->unsent_req_list)) {
                        clear_bit(Wworksched, &m->wsched);
-                       spin_unlock(&m->client->lock);
+                       spin_unlock(&m->req_lock);
                        return;
                }

@@ -472,7 +474,7 @@ static void p9_write_work(struct work_struct *work)
                m->wpos = 0;
                p9_req_get(req);
                m->wreq = req;
-               spin_unlock(&m->client->lock);
+               spin_unlock(&m->req_lock);
        }

        p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n",
@@ -589,6 +591,7 @@ static void p9_conn_create(struct p9_client *client)
        INIT_LIST_HEAD(&m->mux_list);
        m->client = client;

+       spin_lock_init(&m->req_lock);
        INIT_LIST_HEAD(&m->req_list);
        INIT_LIST_HEAD(&m->unsent_req_list);
        INIT_WORK(&m->rq, p9_read_work);
@@ -670,10 +673,10 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
        if (m->err < 0)
                return m->err;

-       spin_lock(&client->lock);
+       spin_lock(&m->req_lock);
        req->status = REQ_STATUS_UNSENT;
        list_add_tail(&req->req_list, &m->unsent_req_list);
-       spin_unlock(&client->lock);
+       spin_unlock(&m->req_lock);

        if (test_and_clear_bit(Wpending, &m->wsched))
                n = EPOLLOUT;
@@ -688,11 +691,13 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)

 static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
 {
+       struct p9_trans_fd *ts = client->trans;
+       struct p9_conn *m = &ts->conn;
        int ret = 1;

        p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);

-       spin_lock(&client->lock);
+       spin_lock(&m->req_lock);

        if (req->status == REQ_STATUS_UNSENT) {
                list_del(&req->req_list);
@@ -700,21 +705,24 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
                p9_req_put(client, req);
                ret = 0;
        }
-       spin_unlock(&client->lock);
+       spin_unlock(&m->req_lock);

        return ret;
 }

 static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
 {
+       struct p9_trans_fd *ts = client->trans;
+       struct p9_conn *m = &ts->conn;
+
        p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);

-       spin_lock(&client->lock);
+       spin_lock(&m->req_lock);
        /* Ignore cancelled request if message has been received
         * before lock.
         */
        if (req->status == REQ_STATUS_RCVD) {
-               spin_unlock(&client->lock);
+               spin_unlock(&m->req_lock);
                return 0;
        }

@@ -723,7 +731,8 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
         */
        list_del(&req->req_list);
        req->status = REQ_STATUS_FLSHD;
-       spin_unlock(&client->lock);
+       spin_unlock(&m->req_lock);
+
        p9_req_put(client, req);

        return 0;
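A structural sketch of the split (hypothetical shapes, not the real 9p structures): each lock is scoped to the data it actually guards, so the client's IRQ-safe lock and the transport's plain req_lock can follow different locking rules without mismatching contexts.

#include <pthread.h>

struct client {
        pthread_mutex_t idr_lock;       /* fid/tag allocation only */
        /* ... idr state ... */
};

struct conn {
        pthread_mutex_t req_lock;       /* req lists + request status */
        /* ... req_list, unsent_req_list ... */
};

int main(void)
{
        struct client c;
        struct conn m;

        pthread_mutex_init(&c.idr_lock, NULL);
        pthread_mutex_init(&m.req_lock, NULL); /* analogous to spin_lock_init(&m->req_lock) */
        return 0;
}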
From: Angus Chen angus.chen@jaguarmicro.com
[ Upstream commit 71491c54eafa318fdd24a1f26a1c82b28e1ac21d ]
The background is that we use a DPU in cloud computing; the arch is x86 with 80 cores. We can have a lot of virtio devices, e.g. 512 or more. When we probe about 200 virtio_blk devices, probing fails and the stack below is printed:
[25338.485128] virtio-pci 0000:b3:00.0: virtio_pci: leaving for legacy driver
[25338.496174] genirq: Flags mismatch irq 0. 00000080 (virtio418) vs. 00015a00 (timer)
[25338.503822] CPU: 20 PID: 5431 Comm: kworker/20:0 Kdump: loaded Tainted: G OE --------- - - 4.18.0-305.30.1.el8.x86_64
[25338.516403] Hardware name: Inspur NF5280M5/YZMB-00882-10E, BIOS 4.1.21 08/25/2021
[25338.523881] Workqueue: events work_for_cpu_fn
[25338.528235] Call Trace:
[25338.530687]  dump_stack+0x5c/0x80
[25338.534000]  __setup_irq.cold.53+0x7c/0xd3
[25338.538098]  request_threaded_irq+0xf5/0x160
[25338.542371]  vp_find_vqs+0xc7/0x190
[25338.545866]  init_vq+0x17c/0x2e0 [virtio_blk]
[25338.550223]  ? ncpus_cmp_func+0x10/0x10
[25338.554061]  virtblk_probe+0xe6/0x8a0 [virtio_blk]
[25338.558846]  virtio_dev_probe+0x158/0x1f0
[25338.562861]  really_probe+0x255/0x4a0
[25338.566524]  ? __driver_attach_async_helper+0x90/0x90
[25338.571567]  driver_probe_device+0x49/0xc0
[25338.575660]  bus_for_each_drv+0x79/0xc0
[25338.579499]  __device_attach+0xdc/0x160
[25338.583337]  bus_probe_device+0x9d/0xb0
[25338.587167]  device_add+0x418/0x780
[25338.590654]  register_virtio_device+0x9e/0xe0
[25338.595011]  virtio_pci_probe+0xb3/0x140
[25338.598941]  local_pci_probe+0x41/0x90
[25338.602689]  work_for_cpu_fn+0x16/0x20
[25338.606443]  process_one_work+0x1a7/0x360
[25338.610456]  ? create_worker+0x1a0/0x1a0
[25338.614381]  worker_thread+0x1cf/0x390
[25338.618132]  ? create_worker+0x1a0/0x1a0
[25338.622051]  kthread+0x116/0x130
[25338.625283]  ? kthread_flush_work_fn+0x10/0x10
[25338.629731]  ret_from_fork+0x1f/0x40
[25338.633395] virtio_blk: probe of virtio418 failed with error -16
The log "genirq: Flags mismatch irq 0. 00000080 (virtio418) vs. 00015a00 (timer)" was printed because irq 0 is used exclusively by the timer, and when vp_find_vqs() calls vp_find_vqs_msix() and it fails twice (for whatever reason), vp_find_vqs_intx() is called as a fallback. Because vp_dev->pci_dev->irq is zero, we request irq 0 with the IRQF_SHARED flag and get a backtrace like the one above.
According to PCI spec about "Interrupt Pin" Register (Offset 3Dh): "The Interrupt Pin register is a read-only register that identifies the legacy interrupt Message(s) the Function uses. Valid values are 01h, 02h, 03h, and 04h that map to legacy interrupt Messages for INTA, INTB, INTC, and INTD respectively. A value of 00h indicates that the Function uses no legacy interrupt Message(s)."
So if vp_dev->pci_dev->pin is zero, we should not request a legacy interrupt.
Signed-off-by: Angus Chen angus.chen@jaguarmicro.com Suggested-by: Michael S. Tsirkin mst@redhat.com Message-Id: 20220930000915.548-1-angus.chen@jaguarmicro.com Signed-off-by: Michael S. Tsirkin mst@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/virtio/virtio_pci_common.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index ca51fcc9daab..8ffa906541e3 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -403,6 +403,9 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs,
        err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc);
        if (!err)
                return 0;
+       /* Is there an interrupt pin? If not give up. */
+       if (!(to_vp_device(vdev)->pci_dev->pin))
+               return err;
        /* Finally fall back to regular interrupts. */
        return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx);
 }
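The Interrupt Pin register values quoted from the spec decode as in this small illustrative program; a value of 0 is exactly the case the patch refuses to fall back on:

#include <stdio.h>

/* Decoding of the PCI "Interrupt Pin" register (config offset 0x3D):
 * 0 means the function uses no legacy INTx message. */
static const char *pin_name(unsigned char pin)
{
        static const char * const names[] =
                { "none", "INTA", "INTB", "INTC", "INTD" };

        return pin <= 4 ? names[pin] : "reserved";
}

int main(void)
{
        for (unsigned char pin = 0; pin <= 5; pin++)
                printf("pin=%u -> %s\n", pin, pin_name(pin));
        return 0;
}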
From: Enzo Matsumiya ematsumiya@suse.de
[ Upstream commit a4e430c8c8ba96be8c6ec4f2eb108bb8bcbee069 ]
Replace kfree() with kfree_sensitive(), or prepend a memzero_explicit() call in the other cases, when freeing sensitive material that could otherwise be left behind in memory.
Signed-off-by: Enzo Matsumiya ematsumiya@suse.de Reported-by: kernel test robot oliver.sang@intel.com Link: https://lore.kernel.org/r/202209201529.ec633796-oliver.sang@intel.com Reviewed-by: Paulo Alcantara (SUSE) pc@cjr.nz Reviewed-by: Ronnie Sahlberg lsahlber@redhat.com Signed-off-by: Steve French stfrench@microsoft.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/cifs/cifsencrypt.c | 12 ++++++------ fs/cifs/connect.c | 6 +++--- fs/cifs/fs_context.c | 12 ++++++++++-- fs/cifs/misc.c | 2 +- fs/cifs/sess.c | 24 +++++++++++++++--------- fs/cifs/smb2ops.c | 6 +++--- fs/cifs/smb2pdu.c | 19 ++++++++++++++----- 7 files changed, 52 insertions(+), 29 deletions(-)
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 663cb9db4908..419cbdadf248 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -680,7 +680,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
 unlock:
        cifs_server_unlock(ses->server);
 setup_ntlmv2_rsp_ret:
-       kfree(tiblob);
+       kfree_sensitive(tiblob);

        return rc;
 }
@@ -754,14 +754,14 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
                server->secmech.ccmaesdecrypt = NULL;
        }

-       kfree(server->secmech.sdesccmacaes);
+       kfree_sensitive(server->secmech.sdesccmacaes);
        server->secmech.sdesccmacaes = NULL;
-       kfree(server->secmech.sdeschmacsha256);
+       kfree_sensitive(server->secmech.sdeschmacsha256);
        server->secmech.sdeschmacsha256 = NULL;
-       kfree(server->secmech.sdeschmacmd5);
+       kfree_sensitive(server->secmech.sdeschmacmd5);
        server->secmech.sdeschmacmd5 = NULL;
-       kfree(server->secmech.sdescmd5);
+       kfree_sensitive(server->secmech.sdescmd5);
        server->secmech.sdescmd5 = NULL;
-       kfree(server->secmech.sdescsha512);
+       kfree_sensitive(server->secmech.sdescsha512);
        server->secmech.sdescsha512 = NULL;
 }
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index bdc3efdb1221..901d9b232e2c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -315,7 +315,7 @@ cifs_abort_connection(struct TCP_Server_Info *server)
        }
        server->sequence_number = 0;
        server->session_estab = false;
-       kfree(server->session_key.response);
+       kfree_sensitive(server->session_key.response);
        server->session_key.response = NULL;
        server->session_key.len = 0;
        server->lstrp = jiffies;
@@ -1535,7 +1535,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)

        cifs_crypto_secmech_release(server);

-       kfree(server->session_key.response);
+       kfree_sensitive(server->session_key.response);
        server->session_key.response = NULL;
        server->session_key.len = 0;
        kfree(server->hostname);
@@ -4059,7 +4059,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
        if (ses->auth_key.response) {
                cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
                         ses->auth_key.response);
-               kfree(ses->auth_key.response);
+               kfree_sensitive(ses->auth_key.response);
                ses->auth_key.response = NULL;
                ses->auth_key.len = 0;
        }
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 8dc0d923ef6a..ff3ad09f2cd4 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -790,6 +790,13 @@ do { \
        cifs_sb->ctx->field = NULL; \
 } while (0)

+#define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \
+do { \
+       kfree_sensitive(ctx->field); \
+       ctx->field = cifs_sb->ctx->field; \
+       cifs_sb->ctx->field = NULL; \
+} while (0)
+
 static int smb3_reconfigure(struct fs_context *fc)
 {
        struct smb3_fs_context *ctx = smb3_fc2context(fc);
@@ -810,7 +817,7 @@ static int smb3_reconfigure(struct fs_context *fc)
        STEAL_STRING(cifs_sb, ctx, UNC);
        STEAL_STRING(cifs_sb, ctx, source);
        STEAL_STRING(cifs_sb, ctx, username);
-       STEAL_STRING(cifs_sb, ctx, password);
+       STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
        STEAL_STRING(cifs_sb, ctx, domainname);
        STEAL_STRING(cifs_sb, ctx, nodename);
        STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -1154,7 +1161,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
                }
                break;
        case Opt_pass:
-               kfree(ctx->password);
+               kfree_sensitive(ctx->password);
                ctx->password = NULL;
                if (strlen(param->string) == 0)
                        break;
@@ -1462,6 +1469,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
        return 0;

 cifs_parse_mount_err:
+       kfree_sensitive(ctx->password);
        return -EINVAL;
 }
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 197f3c09d3f3..2baf23b3d572 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1129,7 +1129,7 @@ cifs_alloc_hash(const char *name,
 void
 cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc)
 {
-       kfree(*sdesc);
+       kfree_sensitive(*sdesc);
        *sdesc = NULL;
        if (*shash)
                crypto_free_shash(*shash);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 02c8b2906196..30143d3d0a34 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -1211,6 +1211,12 @@ sess_alloc_buffer(struct sess_data *sess_data, int wct)
 static void
 sess_free_buffer(struct sess_data *sess_data)
 {
+       int i;
+
+       /* zero the session data before freeing, as it might contain sensitive info (keys, etc) */
+       for (i = 0; i < 3; i++)
+               if (sess_data->iov[i].iov_base)
+                       memzero_explicit(sess_data->iov[i].iov_base, sess_data->iov[i].iov_len);

        free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
        sess_data->buf0_type = CIFS_NO_BUFFER;
@@ -1372,7 +1378,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
        sess_data->result = rc;
        sess_data->func = NULL;
        sess_free_buffer(sess_data);
-       kfree(ses->auth_key.response);
+       kfree_sensitive(ses->auth_key.response);
        ses->auth_key.response = NULL;
 }

@@ -1511,7 +1517,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
        sess_data->result = rc;
        sess_data->func = NULL;
        sess_free_buffer(sess_data);
-       kfree(ses->auth_key.response);
+       kfree_sensitive(ses->auth_key.response);
        ses->auth_key.response = NULL;
 }

@@ -1646,7 +1652,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
        rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);

 out_free_ntlmsspblob:
-       kfree(ntlmsspblob);
+       kfree_sensitive(ntlmsspblob);
 out:
        sess_free_buffer(sess_data);

@@ -1656,9 +1662,9 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
        }

        /* Else error. Cleanup */
-       kfree(ses->auth_key.response);
+       kfree_sensitive(ses->auth_key.response);
        ses->auth_key.response = NULL;
-       kfree(ses->ntlmssp);
+       kfree_sensitive(ses->ntlmssp);
        ses->ntlmssp = NULL;

        sess_data->func = NULL;
@@ -1757,7 +1763,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
        }

 out_free_ntlmsspblob:
-       kfree(ntlmsspblob);
+       kfree_sensitive(ntlmsspblob);
 out:
        sess_free_buffer(sess_data);

@@ -1765,9 +1771,9 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
        rc = sess_establish_session(sess_data);

        /* Cleanup */
-       kfree(ses->auth_key.response);
+       kfree_sensitive(ses->auth_key.response);
        ses->auth_key.response = NULL;
-       kfree(ses->ntlmssp);
+       kfree_sensitive(ses->ntlmssp);
        ses->ntlmssp = NULL;

        sess_data->func = NULL;
@@ -1843,6 +1849,6 @@ int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
        rc = sess_data->result;

 out:
-       kfree(sess_data);
+       kfree_sensitive(sess_data);
        return rc;
 }
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index cc180d37b8ce..340ca57fa361 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -4691,11 +4691,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
        if (!rc && enc)
                memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);

-       kfree(iv);
+       kfree_sensitive(iv);
 free_sg:
-       kfree(sg);
+       kfree_sensitive(sg);
 free_req:
-       kfree(req);
+       kfree_sensitive(req);
        return rc;
 }
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 31d37afae741..f3032cfafedc 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1344,6 +1344,13 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
 static void
 SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data)
 {
+       int i;
+
+       /* zero the session data before freeing, as it might contain sensitive info (keys, etc) */
+       for (i = 0; i < 2; i++)
+               if (sess_data->iov[i].iov_base)
+                       memzero_explicit(sess_data->iov[i].iov_base, sess_data->iov[i].iov_len);
+
        free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
        sess_data->buf0_type = CIFS_NO_BUFFER;
 }
@@ -1476,6 +1483,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
 out_put_spnego_key:
        key_invalidate(spnego_key);
        key_put(spnego_key);
+       if (rc)
+               kfree_sensitive(ses->auth_key.response);
 out:
        sess_data->result = rc;
        sess_data->func = NULL;
@@ -1572,7 +1581,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
        }

 out:
-       kfree(ntlmssp_blob);
+       memzero_explicit(ntlmssp_blob, blob_length);
        SMB2_sess_free_buffer(sess_data);
        if (!rc) {
                sess_data->result = 0;
@@ -1580,7 +1589,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
                return;
        }
 out_err:
-       kfree(ses->ntlmssp);
+       kfree_sensitive(ses->ntlmssp);
        ses->ntlmssp = NULL;
        sess_data->result = rc;
        sess_data->func = NULL;
@@ -1656,9 +1665,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
        }
 #endif
 out:
-       kfree(ntlmssp_blob);
+       memzero_explicit(ntlmssp_blob, blob_length);
        SMB2_sess_free_buffer(sess_data);
-       kfree(ses->ntlmssp);
+       kfree_sensitive(ses->ntlmssp);
        ses->ntlmssp = NULL;
        sess_data->result = rc;
        sess_data->func = NULL;
@@ -1736,7 +1745,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
                cifs_server_dbg(VFS, "signing requested but authenticated as guest\n");
        rc = sess_data->result;
 out:
-       kfree(sess_data);
+       kfree_sensitive(sess_data);
        return rc;
 }
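A userspace analogue of kfree_sensitive() shows the idea: wipe before free with a call the compiler must not elide. explicit_bzero() is assumed available here (glibc 2.25+ and the BSDs); in the kernel the same role is played by memzero_explicit().

#include <stdlib.h>
#include <string.h>

/* Wipe, then free; explicit_bzero() is guaranteed not to be
 * optimized away the way a plain memset() before free() can be. */
static void free_sensitive(void *p, size_t len)
{
        if (!p)
                return;
        explicit_bzero(p, len);
        free(p);
}

int main(void)
{
        char *key = strdup("hunter2");

        free_sensitive(key, 8);  /* key bytes are gone before the free */
        return 0;
}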