When building with clang:
drivers/gpu/drm/mediatek/mtk_drm_gem.c:255:10: error: incompatible integer to pointer conversion returning 'int' from a function with result type 'void *' [-Wint-conversion]
255 | return -ENOMEM;
| ^~~~~~~
1 error generated.
GCC reports the same issue as a warning, rather than an error.
Prior to commit 7e542ff8b463 ("drm/mediatek: Use struct dma_buf_map in
GEM vmap ops"), this function returned a pointer rather than an integer.
This function is indirectly called in drm_gem_vmap(), which treats NULL
as -ENOMEM through an error pointer. Return NULL in this block to
resolve the warning but keep the same end result.
Fixes: 43f561e809aa ("drm/mediatek: Fix potential memory leak if vmap() fail")
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
This is a fix for a 5.10 backport, so it has no upstream relevance but
I've still cc'd the relevant maintainers in case they have any comments
or want to double check my work.
---
drivers/gpu/drm/mediatek/mtk_drm_gem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
index fe64bf2176f3..b20ea58907c2 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
@@ -252,7 +252,7 @@ void *mtk_drm_gem_prime_vmap(struct drm_gem_object *obj)
if (!mtk_gem->kvaddr) {
kfree(sgt);
kfree(mtk_gem->pages);
- return -ENOMEM;
+ return NULL;
}
out:
kfree(sgt);
---
base-commit: ff0bfa8f23eb4c5a65ee6b0d0b7dc2e3439f1063
change-id: 20230922-5-10-fix-drm-mediatek-backport-0ee69329fef0
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
From: Wayne Lin <wayne.lin(a)amd.com>
[Why]
In drm_dp_mst_topology_mgr_resume() today, it will resume the
mst branch to be ready handling mst mode and also consecutively do
the mst topology probing. Which will cause the dirver have chance
to fire hotplug event before restoring the old state. Then Userspace
will react to the hotplug event based on a wrong state.
[How]
Adjust the mst resume flow as:
1. set dpcd to resume mst branch status
2. restore source old state
3. Do mst resume topology probing
For drm_dp_mst_topology_mgr_resume(), it's better to adjust it to
pull out topology probing work into a 2nd part procedure of the mst
resume. Will have a follow up patch in drm.
Reviewed-by: Chao-kai Wang <stylon.wang(a)amd.com>
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Cc: stable(a)vger.kernel.org
Acked-by: Stylon Wang <stylon.wang(a)amd.com>
Signed-off-by: Wayne Lin <wayne.lin(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit ec5fa9fcdeca69edf7dab5ca3b2e0ceb1c08fe9a)
Adjust for missing variable rename in
f0127cb11299 ("drm/amdgpu/display/mst: adjust the naming of mst_port and port of aconnector")
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
---
This is a follow up for https://lore.kernel.org/stable/2023092029-banter-truth-cf72@gregkh/
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 93 ++++++++++++++++---
1 file changed, 80 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index c8e562dcd99d..b98ee4b41863 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2339,14 +2339,62 @@ static int dm_late_init(void *handle)
return detect_mst_link_for_all_connectors(adev_to_drm(adev));
}
+static void resume_mst_branch_status(struct drm_dp_mst_topology_mgr *mgr)
+{
+ int ret;
+ u8 guid[16];
+ u64 tmp64;
+
+ mutex_lock(&mgr->lock);
+ if (!mgr->mst_primary)
+ goto out_fail;
+
+ if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) {
+ drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during suspend?\n");
+ goto out_fail;
+ }
+
+ ret = drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL,
+ DP_MST_EN |
+ DP_UP_REQ_EN |
+ DP_UPSTREAM_IS_SRC);
+ if (ret < 0) {
+ drm_dbg_kms(mgr->dev, "mst write failed - undocked during suspend?\n");
+ goto out_fail;
+ }
+
+ /* Some hubs forget their guids after they resume */
+ ret = drm_dp_dpcd_read(mgr->aux, DP_GUID, guid, 16);
+ if (ret != 16) {
+ drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during suspend?\n");
+ goto out_fail;
+ }
+
+ if (memchr_inv(guid, 0, 16) == NULL) {
+ tmp64 = get_jiffies_64();
+ memcpy(&guid[0], &tmp64, sizeof(u64));
+ memcpy(&guid[8], &tmp64, sizeof(u64));
+
+ ret = drm_dp_dpcd_write(mgr->aux, DP_GUID, guid, 16);
+
+ if (ret != 16) {
+ drm_dbg_kms(mgr->dev, "check mstb guid failed - undocked during suspend?\n");
+ goto out_fail;
+ }
+ }
+
+ memcpy(mgr->mst_primary->guid, guid, 16);
+
+out_fail:
+ mutex_unlock(&mgr->lock);
+}
+
static void s3_handle_mst(struct drm_device *dev, bool suspend)
{
struct amdgpu_dm_connector *aconnector;
struct drm_connector *connector;
struct drm_connector_list_iter iter;
struct drm_dp_mst_topology_mgr *mgr;
- int ret;
- bool need_hotplug = false;
drm_connector_list_iter_begin(dev, &iter);
drm_for_each_connector_iter(connector, &iter) {
@@ -2368,18 +2416,15 @@ static void s3_handle_mst(struct drm_device *dev, bool suspend)
if (!dp_is_lttpr_present(aconnector->dc_link))
dc_link_aux_try_to_configure_timeout(aconnector->dc_link->ddc, LINK_AUX_DEFAULT_TIMEOUT_PERIOD);
- ret = drm_dp_mst_topology_mgr_resume(mgr, true);
- if (ret < 0) {
- dm_helpers_dp_mst_stop_top_mgr(aconnector->dc_link->ctx,
- aconnector->dc_link);
- need_hotplug = true;
- }
+ /* TODO: move resume_mst_branch_status() into drm mst resume again
+ * once topology probing work is pulled out from mst resume into mst
+ * resume 2nd step. mst resume 2nd step should be called after old
+ * state getting restored (i.e. drm_atomic_helper_resume()).
+ */
+ resume_mst_branch_status(mgr);
}
}
drm_connector_list_iter_end(&iter);
-
- if (need_hotplug)
- drm_kms_helper_hotplug_event(dev);
}
static int amdgpu_dm_smu_write_watermarks_table(struct amdgpu_device *adev)
@@ -2768,7 +2813,8 @@ static int dm_resume(void *handle)
struct dm_atomic_state *dm_state = to_dm_atomic_state(dm->atomic_obj.state);
enum dc_connection_type new_connection_type = dc_connection_none;
struct dc_state *dc_state;
- int i, r, j;
+ int i, r, j, ret;
+ bool need_hotplug = false;
if (amdgpu_in_reset(adev)) {
dc_state = dm->cached_dc_state;
@@ -2866,7 +2912,7 @@ static int dm_resume(void *handle)
continue;
/*
- * this is the case when traversing through already created
+ * this is the case when traversing through already created end sink
* MST connectors, should be skipped
*/
if (aconnector && aconnector->mst_port)
@@ -2926,6 +2972,27 @@ static int dm_resume(void *handle)
dm->cached_state = NULL;
+ /* Do mst topology probing after resuming cached state*/
+ drm_connector_list_iter_begin(ddev, &iter);
+ drm_for_each_connector_iter(connector, &iter) {
+ aconnector = to_amdgpu_dm_connector(connector);
+ if (aconnector->dc_link->type != dc_connection_mst_branch ||
+ aconnector->mst_port)
+ continue;
+
+ ret = drm_dp_mst_topology_mgr_resume(&aconnector->mst_mgr, true);
+
+ if (ret < 0) {
+ dm_helpers_dp_mst_stop_top_mgr(aconnector->dc_link->ctx,
+ aconnector->dc_link);
+ need_hotplug = true;
+ }
+ }
+ drm_connector_list_iter_end(&iter);
+
+ if (need_hotplug)
+ drm_kms_helper_hotplug_event(ddev);
+
amdgpu_dm_irq_resume_late(adev);
amdgpu_dm_smu_write_watermarks_table(adev);
--
2.34.1
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 479965a2b7ec481737df0cadf553331063b9c343
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100424-cheer-freeness-471f@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 479965a2b7ec481737df0cadf553331063b9c343 Mon Sep 17 00:00:00 2001
From: Kristina Martsenko <kristina.martsenko(a)arm.com>
Date: Tue, 12 Sep 2023 14:34:29 +0100
Subject: [PATCH] arm64: cpufeature: Fix CLRBHB and BC detection
ClearBHB support is indicated by the CLRBHB field in ID_AA64ISAR2_EL1.
Following some refactoring the kernel incorrectly checks the BC field
instead. Fix the detection to use the right field.
(Note: The original ClearBHB support had it as FTR_HIGHER_SAFE, but this
patch uses FTR_LOWER_SAFE, which seems more correct.)
Also fix the detection of BC (hinted conditional branches) to use
FTR_LOWER_SAFE, so that it is not reported on mismatched systems.
Fixes: 356137e68a9f ("arm64/sysreg: Make BHB clear feature defines match the architecture")
Fixes: 8fcc8285c0e3 ("arm64/sysreg: Convert ID_AA64ISAR2_EL1 to automatic generation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Kristina Martsenko <kristina.martsenko(a)arm.com>
Reviewed-by: Mark Brown <broonie(a)kernel.org>
Link: https://lore.kernel.org/r/20230912133429.2606875-1-kristina.martsenko@arm.c…
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 96e50227f940..5bba39376055 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -663,7 +663,7 @@ static inline bool supports_clearbhb(int scope)
isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
return cpuid_feature_extract_unsigned_field(isar2,
- ID_AA64ISAR2_EL1_BC_SHIFT);
+ ID_AA64ISAR2_EL1_CLRBHB_SHIFT);
}
const struct cpumask *system_32bit_el0_cpumask(void);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index b018ae12ff5f..444a73c2e638 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -222,7 +222,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_MOPS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0),
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 2517ef7c21cf..76ce150e7347 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1347,7 +1347,11 @@ UnsignedEnum 51:48 RPRFM
0b0000 NI
0b0001 IMP
EndEnum
-Res0 47:28
+Res0 47:32
+UnsignedEnum 31:28 CLRBHB
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
UnsignedEnum 27:24 PAC_frac
0b0000 NI
0b0001 IMP
This series adds a new sysctl accept_ra_min_lft which enforces a minimum
lifetime value for individual RA sections; in particular, router
lifetime, PIO preferred lifetime, and RIO lifetime. If any of those
lifetimes are lower than the configured value, the specific RA section
is ignored.
This fixes a potential denial of service attack vector where rogue WiFi
routers (or devices) can send RAs with low lifetimes to actively drain a
mobile device's battery (by preventing sleep).
In addition to this change, Android uses hardware offloads to drop RAs
for a fraction of the minimum of all lifetimes present in the RA (some
networks have very frequent RAs (5s) with high lifetimes (2h)). Despite
this, we have encountered networks that set the router lifetime to 30s
which results in very frequent CPU wakeups. Instead of disabling IPv6
(and dropping IPv6 ethertype in the WiFi firmware) entirely on such
networks, misconfigured routers must be ignored while still processing
RAs from other IPv6 routers on the same network (i.e. to support IoT
applications).
Patches:
- 1671bcfd76fd ("net: add sysctl accept_ra_min_rtr_lft")
- 5027d54a9c30 ("net: change accept_ra_min_rtr_lft to affect all RA lifetimes")
- 5cb249686e67 ("net: release reference to inet6_dev pointer")
Patrick Rohr (3):
net: add sysctl accept_ra_min_rtr_lft
net: change accept_ra_min_rtr_lft to affect all RA lifetimes
net: release reference to inet6_dev pointer
Documentation/networking/ip-sysctl.rst | 8 ++++++++
include/linux/ipv6.h | 1 +
include/uapi/linux/ipv6.h | 1 +
net/ipv6/addrconf.c | 13 +++++++++++++
net/ipv6/ndisc.c | 13 +++++++++++--
5 files changed, 34 insertions(+), 2 deletions(-)
--
2.42.0.515.g380fc7ccd1-goog
This is an automatic generated email to let you know that the following patch were queued:
Subject: media: ccs: Correctly initialise try compose rectangle
Author: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Date: Mon Sep 4 15:57:37 2023 +0300
Initialise the try sink compose rectangle size to the sink compose
rectangle for binner and scaler sub-devices. This was missed due to the
faulty condition that lead to the compose rectangles to be initialised for
the pixel array sub-device where it is not relevant.
Fixes: ccfc97bdb5ae ("[media] smiapp: Add driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
drivers/media/i2c/ccs/ccs-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
---
diff --git a/drivers/media/i2c/ccs/ccs-core.c b/drivers/media/i2c/ccs/ccs-core.c
index 6a8116454f87..022e8712d48e 100644
--- a/drivers/media/i2c/ccs/ccs-core.c
+++ b/drivers/media/i2c/ccs/ccs-core.c
@@ -3097,7 +3097,7 @@ static int ccs_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
try_fmt->code = sensor->internal_csi_format->code;
try_fmt->field = V4L2_FIELD_NONE;
- if (ssd != sensor->pixel_array)
+ if (ssd == sensor->pixel_array)
continue;
try_comp = v4l2_subdev_get_try_compose(sd, fh->state, i);
Hi there,
I found a nullptr dereference in perf subsystem and it affects at least
v5.10 and v6.1 stable trees. (the same poc cannot trigger the crash in
the mainline).
I fail to find the root cause the bug. All I know is that it is a race
condition in the logic of moving_groups from pure software-based perf
events to hardware ones. More specifically, when we add a hardware perf
event to a software event group, it will trigger a "move_group" logic in
perf_event_open. When the "move_group" logic happens, it will remove all
existing events from the context first using `perf_remove_from_context`.
And it will invoke `__perf_remove_from_context` through
`event_function_call`.
Notice that `event_function_call` is defined as follow:
~~~
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
...
func(event, NULL, ctx, data);
...
}
~~~
This means `__perf_remove_from_context` will be invoked with
cpuctx==NULL, which leads to invoking `event_sched_out` with cpuctx ==
NULL.
At this moment, as long as the event is active, we are going to invoke
the `if (event->attr.exclusive || !cpuctx->active_oncpu)` logic, which
is a null pointer deference.
I don't know the proper way to patch this bug. So I'm asking for help.
A reproducer is attached to this email.
Best,
Kyle Zeng
If of_clk_add_provider() fails in ca8210_register_ext_clock(),
it calls clk_unregister() to release priv->clk and returns an
error. However, the caller ca8210_probe() then calls ca8210_remove(),
where priv->clk is freed again in ca8210_unregister_ext_clock(). In
this case, a use-after-free may happen in the second time we call
clk_unregister().
Fix this by removing the first clk_unregister(). Also, priv->clk could
be an error code on failure of clk_register_fixed_rate(). Use
IS_ERR_OR_NULL to catch this case in ca8210_unregister_ext_clock().
Fixes: ded845a781a5 ("ieee802154: Add CA8210 IEEE 802.15.4 device driver")
Signed-off-by: Dinghao Liu <dinghao.liu(a)zju.edu.cn>
---
Changelog:
v2: -Remove the first clk_unregister() instead of nulling priv->clk.
v3: -Simplify ca8210_register_ext_clock().
-Add a ';' after return in ca8210_unregister_ext_clock().
---
drivers/net/ieee802154/ca8210.c | 16 +++-------------
1 file changed, 3 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c
index aebb19f1b3a4..ae44a9133937 100644
--- a/drivers/net/ieee802154/ca8210.c
+++ b/drivers/net/ieee802154/ca8210.c
@@ -2757,18 +2757,8 @@ static int ca8210_register_ext_clock(struct spi_device *spi)
dev_crit(&spi->dev, "Failed to register external clk\n");
return PTR_ERR(priv->clk);
}
- ret = of_clk_add_provider(np, of_clk_src_simple_get, priv->clk);
- if (ret) {
- clk_unregister(priv->clk);
- dev_crit(
- &spi->dev,
- "Failed to register external clock as clock provider\n"
- );
- } else {
- dev_info(&spi->dev, "External clock set as clock provider\n");
- }
- return ret;
+ return of_clk_add_provider(np, of_clk_src_simple_get, priv->clk);
}
/**
@@ -2780,8 +2770,8 @@ static void ca8210_unregister_ext_clock(struct spi_device *spi)
{
struct ca8210_priv *priv = spi_get_drvdata(spi);
- if (!priv->clk)
- return
+ if (IS_ERR_OR_NULL(priv->clk))
+ return;
of_clk_del_provider(spi->dev.of_node);
clk_unregister(priv->clk);
--
2.17.1
The quilt patch titled
Subject: mm: make PR_MDWE_REFUSE_EXEC_GAIN an unsigned long
has been removed from the -mm tree. Its filename was
mm-make-pr_mdwe_refuse_exec_gain-an-unsigned-long.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Florent Revest <revest(a)chromium.org>
Subject: mm: make PR_MDWE_REFUSE_EXEC_GAIN an unsigned long
Date: Mon, 28 Aug 2023 17:08:56 +0200
Defining a prctl flag as an int is a footgun because on a 64 bit machine
and with a variadic implementation of prctl (like in musl and glibc), when
used directly as a prctl argument, it can get casted to long with garbage
upper bits which would result in unexpected behaviors.
This patch changes the constant to an unsigned long to eliminate that
possibilities. This does not break UAPI.
I think that a stable backport would be "nice to have": to reduce the
chances that users build binaries that could end up with garbage bits in
their MDWE prctl arguments. We are not aware of anyone having yet
encountered this corner case with MDWE prctls but a backport would reduce
the likelihood it happens, since this sort of issues has happened with
other prctls. But If this is perceived as a backporting burden, I suppose
we could also live without a stable backport.
Link: https://lkml.kernel.org/r/20230828150858.393570-5-revest@chromium.org
Fixes: b507808ebce2 ("mm: implement memory-deny-write-execute as a prctl")
Signed-off-by: Florent Revest <revest(a)chromium.org>
Suggested-by: Alexey Izbyshev <izbyshev(a)ispras.ru>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Acked-by: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Anshuman Khandual <anshuman.khandual(a)arm.com>
Cc: Ayush Jain <ayush.jain3(a)amd.com>
Cc: Greg Thelen <gthelen(a)google.com>
Cc: Joey Gouly <joey.gouly(a)arm.com>
Cc: KP Singh <kpsingh(a)kernel.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Szabolcs Nagy <Szabolcs.Nagy(a)arm.com>
Cc: Topi Miettinen <toiwoton(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/uapi/linux/prctl.h | 2 +-
tools/include/uapi/linux/prctl.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
--- a/include/uapi/linux/prctl.h~mm-make-pr_mdwe_refuse_exec_gain-an-unsigned-long
+++ a/include/uapi/linux/prctl.h
@@ -283,7 +283,7 @@ struct prctl_mm_map {
/* Memory deny write / execute */
#define PR_SET_MDWE 65
-# define PR_MDWE_REFUSE_EXEC_GAIN 1
+# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0)
#define PR_GET_MDWE 66
--- a/tools/include/uapi/linux/prctl.h~mm-make-pr_mdwe_refuse_exec_gain-an-unsigned-long
+++ a/tools/include/uapi/linux/prctl.h
@@ -283,7 +283,7 @@ struct prctl_mm_map {
/* Memory deny write / execute */
#define PR_SET_MDWE 65
-# define PR_MDWE_REFUSE_EXEC_GAIN 1
+# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0)
#define PR_GET_MDWE 66
_
Patches currently in -mm which might be from revest(a)chromium.org are
The quilt patch titled
Subject: mm/migrate: fix do_pages_move for compat pointers
has been removed from the -mm tree. Its filename was
mm-migrate-fix-do_pages_move-for-compat-pointers.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Gregory Price <gourry.memverge(a)gmail.com>
Subject: mm/migrate: fix do_pages_move for compat pointers
Date: Tue, 3 Oct 2023 10:48:56 -0400
do_pages_move does not handle compat pointers for the page list.
correctly. Add in_compat_syscall check and appropriate get_user fetch
when iterating the page list.
It makes the syscall in compat mode (32-bit userspace, 64-bit kernel)
work the same way as the native 32-bit syscall again, restoring the
behavior before my broken commit 5b1b561ba73c ("mm: simplify
compat_sys_move_pages").
More specifically, my patch moved the parsing of the 'pages' array from
the main entry point into do_pages_stat(), which left the syscall
working correctly for the 'stat' operation (nodes = NULL), while the
'move' operation (nodes != NULL) is now missing the conversion and
interprets 'pages' as an array of 64-bit pointers instead of the
intended 32-bit userspace pointers.
It is possible that nobody noticed this bug because the few
applications that actually call move_pages are unlikely to run in
compat mode because of their large memory requirements, but this
clearly fixes a user-visible regression and should have been caught by
ltp.
Link: https://lkml.kernel.org/r/20231003144857.752952-1-gregory.price@memverge.com
Fixes: 5b1b561ba73c ("mm: simplify compat_sys_move_pages")
Signed-off-by: Gregory Price <gregory.price(a)memverge.com>
Reported-by: Arnd Bergmann <arnd(a)arndb.de>
Co-developed-by: Arnd Bergmann <arnd(a)arndb.de>
Cc: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
--- a/mm/migrate.c~mm-migrate-fix-do_pages_move-for-compat-pointers
+++ a/mm/migrate.c
@@ -2162,6 +2162,7 @@ static int do_pages_move(struct mm_struc
const int __user *nodes,
int __user *status, int flags)
{
+ compat_uptr_t __user *compat_pages = (void __user *)pages;
int current_node = NUMA_NO_NODE;
LIST_HEAD(pagelist);
int start, i;
@@ -2174,8 +2175,17 @@ static int do_pages_move(struct mm_struc
int node;
err = -EFAULT;
- if (get_user(p, pages + i))
- goto out_flush;
+ if (in_compat_syscall()) {
+ compat_uptr_t cp;
+
+ if (get_user(cp, compat_pages + i))
+ goto out_flush;
+
+ p = compat_ptr(cp);
+ } else {
+ if (get_user(p, pages + i))
+ goto out_flush;
+ }
if (get_user(node, nodes + i))
goto out_flush;
_
Patches currently in -mm which might be from gourry.memverge(a)gmail.com are
mm-migrate-remove-unused-mm-argument-from-do_move_pages_to_node.patch
The quilt patch titled
Subject: mm/mempolicy: fix set_mempolicy_home_node() previous VMA pointer
has been removed from the -mm tree. Its filename was
mm-mempolicy-fix-set_mempolicy_home_node-previous-vma-pointer.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mm/mempolicy: fix set_mempolicy_home_node() previous VMA pointer
Date: Thu, 28 Sep 2023 13:24:32 -0400
The two users of mbind_range() are expecting that mbind_range() will
update the pointer to the previous VMA, or return an error. However,
set_mempolicy_home_node() does not call mbind_range() if there is no VMA
policy. The fix is to update the pointer to the previous VMA prior to
continuing iterating the VMAs when there is no policy.
Users may experience a WARN_ON() during VMA policy updates when updating
a range of VMAs on the home node.
Link: https://lkml.kernel.org/r/20230928172432.2246534-1-Liam.Howlett@oracle.com
Link: https://lore.kernel.org/linux-mm/CALcu4rbT+fMVNaO_F2izaCT+e7jzcAciFkOvk21HG…
Fixes: f4e9e0e69468 ("mm/mempolicy: fix use-after-free of VMA iterator")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: Yikebaer Aizezi <yikebaer61(a)gmail.com>
Closes: https://lore.kernel.org/linux-mm/CALcu4rbT+fMVNaO_F2izaCT+e7jzcAciFkOvk21HG…
Reviewed-by: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mempolicy.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/mm/mempolicy.c~mm-mempolicy-fix-set_mempolicy_home_node-previous-vma-pointer
+++ a/mm/mempolicy.c
@@ -1543,8 +1543,10 @@ SYSCALL_DEFINE4(set_mempolicy_home_node,
* the home node for vmas we already updated before.
*/
old = vma_policy(vma);
- if (!old)
+ if (!old) {
+ prev = vma;
continue;
+ }
if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
err = -EOPNOTSUPP;
break;
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
mmap-add-clarifying-comment-to-vma_merge-code.patch
radix-tree-test-suite-fix-allocation-calculation-in-kmem_cache_alloc_bulk.patch
The quilt patch titled
Subject: mmap: fix vma_iterator in error path of vma_merge()
has been removed from the -mm tree. Its filename was
mmap-fix-vma_iterator-in-error-path-of-vma_merge.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mmap: fix vma_iterator in error path of vma_merge()
Date: Fri, 29 Sep 2023 14:30:39 -0400
During the error path, the vma iterator may not be correctly positioned or
set to the correct range. Undo the vma_prev() call by resetting to the
passed in address. Re-walking to the same range will fix the range to the
area previously passed in.
Users would notice increased cycles as vma_merge() would be called an
extra time with vma == prev, and thus would fail to merge and return.
Link: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejt…
Link: https://lkml.kernel.org/r/20230929183041.2835469-2-Liam.Howlett@oracle.com
Fixes: 18b098af2890 ("vma_merge: set vma iterator to correct position.")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Closes: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejt…
Reviewed-by: Lorenzo Stoakes <lstoakes(a)gmail.com>
Acked-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/mm/mmap.c~mmap-fix-vma_iterator-in-error-path-of-vma_merge
+++ a/mm/mmap.c
@@ -975,7 +975,7 @@ struct vm_area_struct *vma_merge(struct
/* Error in anon_vma clone. */
if (err)
- return NULL;
+ goto anon_vma_fail;
if (vma_start < vma->vm_start || vma_end > vma->vm_end)
vma_expanded = true;
@@ -988,7 +988,7 @@ struct vm_area_struct *vma_merge(struct
}
if (vma_iter_prealloc(vmi, vma))
- return NULL;
+ goto prealloc_fail;
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
@@ -1016,6 +1016,12 @@ struct vm_area_struct *vma_merge(struct
vma_complete(&vp, vmi, mm);
khugepaged_enter_vma(res, vm_flags);
return res;
+
+prealloc_fail:
+anon_vma_fail:
+ vma_iter_set(vmi, addr);
+ vma_iter_load(vmi);
+ return NULL;
}
/*
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
mmap-add-clarifying-comment-to-vma_merge-code.patch
radix-tree-test-suite-fix-allocation-calculation-in-kmem_cache_alloc_bulk.patch
The quilt patch titled
Subject: mm/page_alloc: correct start page when guard page debug is enabled
has been removed from the -mm tree. Its filename was
mm-page_alloc-correct-start-page-when-guard-page-debug-is-enabled.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Kemeng Shi <shikemeng(a)huaweicloud.com>
Subject: mm/page_alloc: correct start page when guard page debug is enabled
Date: Wed, 27 Sep 2023 17:44:01 +0800
When guard page debug is enabled and set_page_guard returns success, we
miss to forward page to point to start of next split range and we will do
split unexpectedly in page range without target page. Move start page
update before set_page_guard to fix this.
As we split to wrong target page, then splited pages are not able to merge
back to original order when target page is put back and splited pages
except target page is not usable. To be specific:
Consider target page is the third page in buddy page with order 2.
| buddy-2 | Page | Target | Page |
After break down to target page, we will only set first page to Guard
because of bug.
| Guard | Page | Target | Page |
When we try put_page_back_buddy with target page, the buddy page of target
if neither guard nor buddy, Then it's not able to construct original page
with order 2
| Guard | Page | buddy-0 | Page |
All pages except target page is not in free list and is not usable.
Link: https://lkml.kernel.org/r/20230927094401.68205-1-shikemeng@huaweicloud.com
Fixes: 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages")
Signed-off-by: Kemeng Shi <shikemeng(a)huaweicloud.com>
Acked-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page_alloc.c~mm-page_alloc-correct-start-page-when-guard-page-debug-is-enabled
+++ a/mm/page_alloc.c
@@ -6475,6 +6475,7 @@ static void break_down_buddy_pages(struc
next_page = page;
current_buddy = page + size;
}
+ page = next_page;
if (set_page_guard(zone, current_buddy, high, migratetype))
continue;
@@ -6482,7 +6483,6 @@ static void break_down_buddy_pages(struc
if (current_buddy != target) {
add_to_free_list(current_buddy, zone, high, migratetype);
set_buddy_order(current_buddy, high);
- page = next_page;
}
}
}
_
Patches currently in -mm which might be from shikemeng(a)huaweicloud.com are
mm-page_alloc-remove-unnecessary-check-in-break_down_buddy_pages.patch
mm-page_alloc-remove-unnecessary-next_page-in-break_down_buddy_pages.patch
When a zswap store fails due to the limit, it acquires a pool
reference and queues the shrinker. When the shrinker runs, it drops
the reference. However, there can be multiple store attempts before
the shrinker wakes up and runs once. This results in reference leaks
and eventual saturation warnings for the pool refcount.
Fix this by dropping the reference again when the shrinker is already
queued. This ensures one reference per shrinker run.
Reported-by: Chris Mason <clm(a)fb.com>
Fixes: 45190f01dd40 ("mm/zswap.c: add allocation hysteresis if pool limit is hit")
Cc: stable(a)vger.kernel.org [5.6+]
Cc: Vitaly Wool <vitaly.wool(a)konsulko.com>
Cc: Domenico Cerasuolo <cerasuolodomenico(a)gmail.com>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
---
mm/zswap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index 083c693602b8..37d2b1cb2ecb 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1383,8 +1383,8 @@ bool zswap_store(struct folio *folio)
shrink:
pool = zswap_pool_last_get();
- if (pool)
- queue_work(shrink_wq, &pool->shrink_work);
+ if (pool && !queue_work(shrink_wq, &pool->shrink_work))
+ zswap_pool_put(pool);
goto reject;
}
--
2.42.0
The patch titled
Subject: hugetlbfs: close race between MADV_DONTNEED and page fault
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
hugetlbfs-close-race-between-madv_dontneed-and-page-fault.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Rik van Riel <riel(a)surriel.com>
Subject: hugetlbfs: close race between MADV_DONTNEED and page fault
Date: Thu, 5 Oct 2023 23:59:08 -0400
Malloc libraries, like jemalloc and tcalloc, take decisions on when to
call madvise independently from the code in the main application.
This sometimes results in the application page faulting on an address,
right after the malloc library has shot down the backing memory with
MADV_DONTNEED.
Usually this is harmless, because we always have some 4kB pages sitting
around to satisfy a page fault. However, with hugetlbfs systems often
allocate only the exact number of huge pages that the application wants.
Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
any lock taken on the page fault path, which can open up the following
race condition:
CPU 1 CPU 2
MADV_DONTNEED
unmap page
shoot down TLB entry
page fault
fail to allocate a huge page
killed with SIGBUS
free page
Fix that race by pulling the locking from __unmap_hugepage_final_range
into helper functions called from zap_page_range_single. This ensures
page faults stay locked out of the MADV_DONTNEED VMA until the huge pages
have actually been freed.
Link: https://lkml.kernel.org/r/20231006040020.3677377-4-riel@surriel.com
Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++--
mm/hugetlb.c | 34 ++++++++++++++++++++++------------
mm/memory.c | 13 ++++++++-----
3 files changed, 63 insertions(+), 19 deletions(-)
--- a/include/linux/hugetlb.h~hugetlbfs-close-race-between-madv_dontneed-and-page-fault
+++ a/include/linux/hugetlb.h
@@ -139,7 +139,7 @@ struct page *hugetlb_follow_page_mask(st
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *,
zap_flags_t);
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+void __unmap_hugepage_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page, zap_flags_t zap_flags);
@@ -246,6 +246,25 @@ int huge_pmd_unshare(struct mm_struct *m
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
+extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
+ unsigned long *begin, unsigned long *end);
+extern void __hugetlb_zap_end(struct vm_area_struct *vma,
+ struct zap_details *details);
+
+static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+ if (is_vm_hugetlb_page(vma))
+ __hugetlb_zap_begin(vma, start, end);
+}
+
+static inline void hugetlb_zap_end(struct vm_area_struct *vma,
+ struct zap_details *details)
+{
+ if (is_vm_hugetlb_page(vma))
+ __hugetlb_zap_end(vma, details);
+}
+
void hugetlb_vma_lock_read(struct vm_area_struct *vma);
void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
void hugetlb_vma_lock_write(struct vm_area_struct *vma);
@@ -297,6 +316,18 @@ static inline void adjust_range_if_pmd_s
{
}
+static inline void hugetlb_zap_begin(
+ struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
+{
+}
+
+static inline void hugetlb_zap_end(
+ struct vm_area_struct *vma,
+ struct zap_details *details)
+{
+}
+
static inline struct page *hugetlb_follow_page_mask(
struct vm_area_struct *vma, unsigned long address, unsigned int flags,
unsigned int *page_mask)
@@ -442,7 +473,7 @@ static inline long hugetlb_change_protec
return 0;
}
-static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
--- a/mm/hugetlb.c~hugetlbfs-close-race-between-madv_dontneed-and-page-fault
+++ a/mm/hugetlb.c
@@ -5306,9 +5306,9 @@ int move_hugetlb_page_tables(struct vm_a
return len + old_addr - old_end;
}
-static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct page *ref_page, zap_flags_t zap_flags)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -5437,16 +5437,25 @@ static void __unmap_hugepage_range(struc
tlb_flush_mmu_tlbonly(tlb);
}
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
- struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page,
- zap_flags_t zap_flags)
+void __hugetlb_zap_begin(struct vm_area_struct *vma,
+ unsigned long *start, unsigned long *end)
{
+ if (!vma->vm_file) /* hugetlbfs_file_mmap error */
+ return;
+
+ adjust_range_if_pmd_sharing_possible(vma, start, end);
hugetlb_vma_lock_write(vma);
- i_mmap_lock_write(vma->vm_file->f_mapping);
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+}
+
+void __hugetlb_zap_end(struct vm_area_struct *vma,
+ struct zap_details *details)
+{
+ zap_flags_t zap_flags = details ? details->zap_flags : 0;
- /* mmu notification performed in caller */
- __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
+ if (!vma->vm_file) /* hugetlbfs_file_mmap error */
+ return;
if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */
/*
@@ -5459,11 +5468,12 @@ void __unmap_hugepage_range_final(struct
* someone else.
*/
__hugetlb_vma_unlock_write_free(vma);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
} else {
- i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
}
+
+ if (vma->vm_file)
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
--- a/mm/memory.c~hugetlbfs-close-race-between-madv_dontneed-and-page-fault
+++ a/mm/memory.c
@@ -1683,7 +1683,7 @@ static void unmap_single_vma(struct mmu_
if (vma->vm_file) {
zap_flags_t zap_flags = details ?
details->zap_flags : 0;
- __unmap_hugepage_range_final(tlb, vma, start, end,
+ __unmap_hugepage_range(tlb, vma, start, end,
NULL, zap_flags);
}
} else
@@ -1728,8 +1728,12 @@ void unmap_vmas(struct mmu_gather *tlb,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
do {
- unmap_single_vma(tlb, vma, start_addr, end_addr, &details,
+ unsigned long start = start_addr;
+ unsigned long end = end_addr;
+ hugetlb_zap_begin(vma, &start, &end);
+ unmap_single_vma(tlb, vma, start, end, &details,
mm_wr_locked);
+ hugetlb_zap_end(vma, &details);
} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -1753,9 +1757,7 @@ void zap_page_range_single(struct vm_are
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
- if (is_vm_hugetlb_page(vma))
- adjust_range_if_pmd_sharing_possible(vma, &range.start,
- &range.end);
+ hugetlb_zap_begin(vma, &range.start, &range.end);
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1766,6 +1768,7 @@ void zap_page_range_single(struct vm_are
unmap_single_vma(&tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
+ hugetlb_zap_end(vma, details);
}
/**
_
Patches currently in -mm which might be from riel(a)surriel.com are
hugetlbfs-clear-resv_map-pointer-if-mmap-fails.patch
hugetlbfs-extend-hugetlb_vma_lock-to-private-vmas.patch
hugetlbfs-close-race-between-madv_dontneed-and-page-fault.patch
The patch titled
Subject: hugetlbfs: clear resv_map pointer if mmap fails
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
hugetlbfs-clear-resv_map-pointer-if-mmap-fails.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Rik van Riel <riel(a)surriel.com>
Subject: hugetlbfs: clear resv_map pointer if mmap fails
Date: Thu, 5 Oct 2023 23:59:06 -0400
Patch series "hugetlbfs: close race between MADV_DONTNEED and page fault", v7.
Malloc libraries, like jemalloc and tcalloc, take decisions on when to
call madvise independently from the code in the main application.
This sometimes results in the application page faulting on an address,
right after the malloc library has shot down the backing memory with
MADV_DONTNEED.
Usually this is harmless, because we always have some 4kB pages sitting
around to satisfy a page fault. However, with hugetlbfs systems often
allocate only the exact number of huge pages that the application wants.
Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
any lock taken on the page fault path, which can open up the following
race condition:
CPU 1 CPU 2
MADV_DONTNEED
unmap page
shoot down TLB entry
page fault
fail to allocate a huge page
killed with SIGBUS
free page
Fix that race by extending the hugetlb_vma_lock locking scheme to also
cover private hugetlb mappings (with resv_map), and pulling the locking
from __unmap_hugepage_final_range into helper functions called from
zap_page_range_single. This ensures page faults stay locked out of the
MADV_DONTNEED VMA until the huge pages have actually been freed.
This patch (of 3):
Hugetlbfs leaves a dangling pointer in the VMA if mmap fails. This has
not been a problem so far, but other code in this patch series tries to
follow that pointer.
Link: https://lkml.kernel.org/r/20231006040020.3677377-1-riel@surriel.com
Link: https://lkml.kernel.org/r/20231006040020.3677377-2-riel@surriel.com
Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/mm/hugetlb.c~hugetlbfs-clear-resv_map-pointer-if-mmap-fails
+++ a/mm/hugetlb.c
@@ -1138,8 +1138,7 @@ static void set_vma_resv_map(struct vm_a
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
- set_vma_private_data(vma, (get_vma_private_data(vma) &
- HPAGE_RESV_MASK) | (unsigned long)map);
+ set_vma_private_data(vma, (unsigned long)map);
}
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
@@ -6811,8 +6810,10 @@ out_err:
*/
if (chg >= 0 && add < 0)
region_abort(resv_map, from, to, regions_needed);
- if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
kref_put(&resv_map->refs, resv_map_release);
+ set_vma_resv_map(vma, NULL);
+ }
return false;
}
_
Patches currently in -mm which might be from riel(a)surriel.com are
hugetlbfs-clear-resv_map-pointer-if-mmap-fails.patch
hugetlbfs-extend-hugetlb_vma_lock-to-private-vmas.patch
hugetlbfs-close-race-between-madv_dontneed-and-page-fault.patch
The patch titled
Subject: hugetlbfs: extend hugetlb_vma_lock to private VMAs
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
hugetlbfs-extend-hugetlb_vma_lock-to-private-vmas.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Rik van Riel <riel(a)surriel.com>
Subject: hugetlbfs: extend hugetlb_vma_lock to private VMAs
Date: Thu, 5 Oct 2023 23:59:07 -0400
Extend the locking scheme used to protect shared hugetlb mappings from
truncate vs page fault races, in order to protect private hugetlb mappings
(with resv_map) against MADV_DONTNEED.
Add a read-write semaphore to the resv_map data structure, and use that
from the hugetlb_vma_(un)lock_* functions, in preparation for closing the
race between MADV_DONTNEED and page faults.
Link: https://lkml.kernel.org/r/20231006040020.3677377-3-riel@surriel.com
Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing")
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/hugetlb.h | 6 +++++
mm/hugetlb.c | 41 ++++++++++++++++++++++++++++++++++----
2 files changed, 43 insertions(+), 4 deletions(-)
--- a/include/linux/hugetlb.h~hugetlbfs-extend-hugetlb_vma_lock-to-private-vmas
+++ a/include/linux/hugetlb.h
@@ -60,6 +60,7 @@ struct resv_map {
long adds_in_progress;
struct list_head region_cache;
long region_cache_count;
+ struct rw_semaphore rw_sema;
#ifdef CONFIG_CGROUP_HUGETLB
/*
* On private mappings, the counter to uncharge reservations is stored
@@ -1233,6 +1234,11 @@ static inline bool __vma_shareable_lock(
return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
}
+static inline bool __vma_private_lock(struct vm_area_struct *vma)
+{
+ return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data;
+}
+
/*
* Safe version of huge_pte_offset() to check the locks. See comments
* above huge_pte_offset().
--- a/mm/hugetlb.c~hugetlbfs-extend-hugetlb_vma_lock-to-private-vmas
+++ a/mm/hugetlb.c
@@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struc
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_are
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
down_read(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ down_read(&resv_map->rw_sema);
}
}
@@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_a
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
up_read(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ up_read(&resv_map->rw_sema);
}
}
@@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_ar
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
down_write(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ down_write(&resv_map->rw_sema);
}
}
@@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
up_write(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ up_write(&resv_map->rw_sema);
}
}
int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{
- struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
- if (!__vma_shareable_lock(vma))
- return 1;
+ if (__vma_shareable_lock(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
- return down_write_trylock(&vma_lock->rw_sema);
+ return down_write_trylock(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ return down_write_trylock(&resv_map->rw_sema);
+ }
+
+ return 1;
}
void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
@@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
lockdep_assert_held(&vma_lock->rw_sema);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ lockdep_assert_held(&resv_map->rw_sema);
}
}
@@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_f
struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
__hugetlb_vma_unlock_write_put(vma_lock);
+ } else if (__vma_private_lock(vma)) {
+ struct resv_map *resv_map = vma_resv_map(vma);
+
+ /* no free for anon vmas, but still need to unlock */
+ up_write(&resv_map->rw_sema);
}
}
@@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void)
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
+ init_rwsem(&resv_map->rw_sema);
resv_map->adds_in_progress = 0;
/*
_
Patches currently in -mm which might be from riel(a)surriel.com are
hugetlbfs-clear-resv_map-pointer-if-mmap-fails.patch
hugetlbfs-extend-hugetlb_vma_lock-to-private-vmas.patch
hugetlbfs-close-race-between-madv_dontneed-and-page-fault.patch
The patch titled
Subject: mm: zswap: fix pool refcount bug around shrink_worker()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-zswap-fix-pool-refcount-bug-around-shrink_worker.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Johannes Weiner <hannes(a)cmpxchg.org>
Subject: mm: zswap: fix pool refcount bug around shrink_worker()
Date: Fri, 6 Oct 2023 12:00:24 -0400
When a zswap store fails due to the limit, it acquires a pool reference
and queues the shrinker. When the shrinker runs, it drops the reference.
However, there can be multiple store attempts before the shrinker wakes up
and runs once. This results in reference leaks and eventual saturation
warnings for the pool refcount.
Fix this by dropping the reference again when the shrinker is already
queued. This ensures one reference per shrinker run.
Link: https://lkml.kernel.org/r/20231006160024.170748-1-hannes@cmpxchg.org
Fixes: 45190f01dd40 ("mm/zswap.c: add allocation hysteresis if pool limit is hit")
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reported-by: Chris Mason <clm(a)fb.com>
Cc: Vitaly Wool <vitaly.wool(a)konsulko.com>
Cc: Domenico Cerasuolo <cerasuolodomenico(a)gmail.com>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [5.6+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zswap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/zswap.c~mm-zswap-fix-pool-refcount-bug-around-shrink_worker
+++ a/mm/zswap.c
@@ -1383,8 +1383,8 @@ reject:
shrink:
pool = zswap_pool_last_get();
- if (pool)
- queue_work(shrink_wq, &pool->shrink_work);
+ if (pool && !queue_work(shrink_wq, &pool->shrink_work))
+ zswap_pool_put(pool);
goto reject;
}
_
Patches currently in -mm which might be from hannes(a)cmpxchg.org are
mm-zswap-fix-pool-refcount-bug-around-shrink_worker.patch
The function ext4_init_acl() calls posix_acl_create() which is
responsible for applying the umask. But without
CONFIG_EXT4_FS_POSIX_ACL, ext4_init_acl() is an empty inline function,
and nobody applies the umask.
This fixes a bug which causes the umask to be ignored with O_TMPFILE
on ext4:
https://github.com/MusicPlayerDaemon/MPD/issues/558https://bugs.gentoo.org/show_bug.cgi?id=686142#c3https://bugzilla.kernel.org/show_bug.cgi?id=203625
Reviewed-by: J. Bruce Fields <bfields(a)redhat.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Max Kellermann <max.kellermann(a)ionos.com>
---
fs/ext4/acl.h | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 0c5a79c3b5d4..ef4c19e5f570 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
+ /* usually, the umask is applied by posix_acl_create(), but if
+ ext4 ACL support is disabled at compile time, we need to do
+ it here, because posix_acl_create() will never be called */
+ inode->i_mode &= ~current_umask();
+
return 0;
}
#endif /* CONFIG_EXT4_FS_POSIX_ACL */
--
2.39.2
We got a WARNING in ext4_add_complete_io:
==================================================================
WARNING: at fs/ext4/page-io.c:231 ext4_put_io_end_defer+0x182/0x250
CPU: 10 PID: 77 Comm: ksoftirqd/10 Tainted: 6.3.0-rc2 #85
RIP: 0010:ext4_put_io_end_defer+0x182/0x250 [ext4]
[...]
Call Trace:
<TASK>
ext4_end_bio+0xa8/0x240 [ext4]
bio_endio+0x195/0x310
blk_update_request+0x184/0x770
scsi_end_request+0x2f/0x240
scsi_io_completion+0x75/0x450
scsi_finish_command+0xef/0x160
scsi_complete+0xa3/0x180
blk_complete_reqs+0x60/0x80
blk_done_softirq+0x25/0x40
__do_softirq+0x119/0x4c8
run_ksoftirqd+0x42/0x70
smpboot_thread_fn+0x136/0x3c0
kthread+0x140/0x1a0
ret_from_fork+0x2c/0x50
==================================================================
Above issue may happen as follows:
cpu1 cpu2
----------------------------|----------------------------
mount -o dioread_lock
ext4_writepages
ext4_do_writepages
*if (ext4_should_dioread_nolock(inode))*
// rsv_blocks is not assigned here
mount -o remount,dioread_nolock
ext4_journal_start_with_reserve
__ext4_journal_start
__ext4_journal_start_sb
jbd2__journal_start
*if (rsv_blocks)*
// h_rsv_handle is not initialized here
mpage_map_and_submit_extent
mpage_map_one_extent
dioread_nolock = ext4_should_dioread_nolock(inode)
if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN))
mpd->io_submit.io_end->handle = handle->h_rsv_handle
ext4_set_io_unwritten_flag
io_end->flag |= EXT4_IO_END_UNWRITTEN
// now io_end->handle is NULL but has EXT4_IO_END_UNWRITTEN flag
scsi_finish_command
scsi_io_completion
scsi_io_completion_action
scsi_end_request
blk_update_request
req_bio_endio
bio_endio
bio->bi_end_io > ext4_end_bio
ext4_put_io_end_defer
ext4_add_complete_io
// trigger WARN_ON(!io_end->handle && sbi->s_journal);
The immediate cause of this problem is that ext4_should_dioread_nolock()
function returns inconsistent values in the ext4_do_writepages() and
mpage_map_one_extent(). There are four conditions in this function that
can be changed at mount time to cause this problem. These four conditions
can be divided into two categories:
(1) journal_data and EXT4_EXTENTS_FL, which can be changed by ioctl
(2) DELALLOC and DIOREAD_NOLOCK, which can be changed by remount
The two in the first category have been fixed by commit c8585c6fcaf2
("ext4: fix races between changing inode journal mode and ext4_writepages")
and commit cb85f4d23f79 ("ext4: fix race between writepages and enabling
EXT4_EXTENTS_FL") respectively.
Two cases in the other category have not yet been fixed, and the above
issue is caused by this situation. We refer to the fix for the first
category, when applying options during remount, we grab s_writepages_rwsem
to avoid racing with writepages ops to trigger this problem.
Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io")
Cc: stable(a)vger.kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
V1->V2:
Grab s_writepages_rwsem unconditionally during remount.
Remove patches 1,2 that are no longer needed.
V2->V3:
Also grab s_writepages_rwsem when restoring options.
V3->V4:
Rebased on top of mainline.
Reference 00d873c17e29 ("ext4: avoid deadlock in fs reclaim with
page writeback") to use s_writepages_rwsem.
fs/ext4/ext4.h | 3 ++-
fs/ext4/super.c | 14 ++++++++++++++
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6948d673bba2..97ef99c7f296 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1613,7 +1613,8 @@ struct ext4_sb_info {
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
- * or EXTENTS flag.
+ * or EXTENTS flag or between writepages ops and changing DELALLOC or
+ * DIOREAD_NOLOCK mount options on remount.
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9680fe753e59..fff42682e4e0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6389,6 +6389,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
ext4_group_t g;
int err = 0;
int enable_rw = 0;
+ int alloc_ctx;
#ifdef CONFIG_QUOTA
int enable_quota = 0;
int i, j;
@@ -6429,7 +6430,16 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
+ /*
+ * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause
+ * two calls to ext4_should_dioread_nolock() to return inconsistent
+ * values, triggering WARN_ON in ext4_add_complete_io(). we grab
+ * here s_writepages_rwsem to avoid race between writepages ops and
+ * remount.
+ */
+ alloc_ctx = ext4_writepages_down_write(sb);
ext4_apply_options(fc, sb);
+ ext4_writepages_up_write(sb, alloc_ctx);
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
@@ -6650,6 +6660,8 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
+
+ alloc_ctx = ext4_writepages_down_write(sb);
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -6658,6 +6670,8 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
+ ext4_writepages_up_write(sb, alloc_ctx);
+
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks)
ext4_release_system_zone(sb);
#ifdef CONFIG_QUOTA
--
2.31.1
Direct calls to ops->connect() can overwrite the address parameter when
used in conjunction with BPF SOCK_ADDR hooks. Recent changes to
kernel_connect() ensure that callers are insulated from such side
effects. This patch wraps the direct call to ops->connect() with
kernel_connect() to prevent unexpected changes to the address passed to
ceph_tcp_connect().
This change was originally part of a larger patch targeting the net tree
addressing all instances of unprotected calls to ops->connect()
throughout the kernel, but this change was split up into several patches
targeting various trees.
Link: https://lore.kernel.org/netdev/20230821100007.559638-1-jrife@google.com/
Link: https://lore.kernel.org/netdev/9944248dba1bce861375fcce9de663934d933ba9.cam…
Fixes: d74bad4e74ee ("bpf: Hooks for sys_connect")
Cc: stable(a)vger.kernel.org
Signed-off-by: Jordan Rife <jrife(a)google.com>
---
net/ceph/messenger.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 10a41cd9c5235..3c8b78d9c4d1c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -459,8 +459,8 @@ int ceph_tcp_connect(struct ceph_connection *con)
set_sock_callbacks(sock, con);
con_sock_state_connecting(con);
- ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss),
- O_NONBLOCK);
+ ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss),
+ O_NONBLOCK);
if (ret == -EINPROGRESS) {
dout("connect %s EINPROGRESS sk_state = %u\n",
ceph_pr_addr(&con->peer_addr),
--
2.42.0.582.g8ccd20d70d-goog
Hello There,
May I know if you are interested in any of these services?
Please let me know if you are interested in Re-Design and Development
(HTML, WordPress) or Ranking your Website on Top Page in Google so that we
can send you more information regarding the same.
Make this first impression count with a professional and interesting
website that clearly communicates your brand and what it stands for.
Revert on this mail with an example of a reference website with special
features which you want on your site.
Thanks & Regards
Anjali
This is a note to let you know that I've just added the patch titled
iio: pressure: ms5611: ms5611_prom_is_valid false negative bug
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From fd39d9668f2ce9f4b05ad55e8c8d80c098073e0b Mon Sep 17 00:00:00 2001
From: Alexander Zangerl <az(a)breathe-safe.com>
Date: Wed, 20 Sep 2023 10:01:10 +1000
Subject: iio: pressure: ms5611: ms5611_prom_is_valid false negative bug
The ms5611 driver falsely rejects lots of MS5607-02BA03-50 chips
with "PROM integrity check failed" because it doesn't accept a prom crc
value of zero as legitimate.
According to the datasheet for this chip (and the manufacturer's
application note about the PROM CRC), none of the possible values for the
CRC are excluded - but the current code in ms5611_prom_is_valid() ends with
return crc_orig != 0x0000 && crc == crc_orig
Discussed with the driver author (Tomasz Duszynski) and he indicated that
at that time (2015) he was dealing with some faulty chip samples which
returned blank data under some circumstances and/or followed example code
which indicated CRC zero being bad.
As far as I can tell this exception should not be applied anymore; We've
got a few hundred custom boards here with this chip where large numbers
of the prom have a legitimate CRC value 0, and do work fine, but which the
current driver code wrongly rejects.
Signed-off-by: Alexander Zangerl <az(a)breathe-safe.com>
Fixes: c0644160a8b5 ("iio: pressure: add support for MS5611 pressure and temperature sensor")
Link: https://lore.kernel.org/r/2535-1695168070.831792@Ze3y.dhYT.s3fx
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/pressure/ms5611_core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/iio/pressure/ms5611_core.c b/drivers/iio/pressure/ms5611_core.c
index 627497e61a63..2fc706f9d8ae 100644
--- a/drivers/iio/pressure/ms5611_core.c
+++ b/drivers/iio/pressure/ms5611_core.c
@@ -76,7 +76,7 @@ static bool ms5611_prom_is_valid(u16 *prom, size_t len)
crc = (crc >> 12) & 0x000F;
- return crc_orig != 0x0000 && crc == crc_orig;
+ return crc == crc_orig;
}
static int ms5611_read_prom(struct iio_dev *indio_dev)
--
2.42.0
This is a note to let you know that I've just added the patch titled
iio: light: vcnl4000: Don't power on/off chip in config
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 7e87ab38eed09c9dec56da361d74158159ae84a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5rten=20Lindahl?= <marten.lindahl(a)axis.com>
Date: Mon, 11 Sep 2023 14:43:01 +0200
Subject: iio: light: vcnl4000: Don't power on/off chip in config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
After enabling/disabling interrupts on the vcnl4040 chip the als and/or
ps sensor is powered on or off depending on the interrupt enable bits.
This is made as a last step in write_event_config.
But there is no reason to do this as the runtime PM handles the power
state of the sensors. Interfering with this may impact sensor readings.
Consider the following:
1. Userspace makes sensor data reading which triggers RPM resume
(sensor powered on) and a RPM suspend timeout. The timeout is 2000ms
before RPM suspend powers the sensor off if no new reading is made
within the timeout period.
2. Userspace disables interrupts => powers sensor off
3. Userspace reads sensor data = 0 because sensor is off and the
suspend timeout has not passed. For each new reading made within the
timeout period the timeout is renewed with 2000ms and RPM will not
make a new resume (device was not suspended). So the sensor will
not be powered on.
4. No further userspace reading for 2000ms ends RPM suspend timeout and
triggers suspend (powers off already powered off sensor).
Powering sensor off in (2) makes all consecutive readings made within
2000ms to the previous reading (3) return invalid data.
Skip setting power state when writing new event config.
Fixes: 546676121cb9 ("iio: light: vcnl4000: Add interrupt support for vcnl4040")
Fixes: bc292aaf9cb4 ("iio: light: vcnl4000: add illuminance irq vcnl4040/4200")
Signed-off-by: Mårten Lindahl <marten.lindahl(a)axis.com>
Link: https://lore.kernel.org/r/20230907-vcnl4000-pm-fix-v2-1-298e01f54db4@axis.c…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/light/vcnl4000.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/iio/light/vcnl4000.c b/drivers/iio/light/vcnl4000.c
index 3a52b09c2823..fdf763a04b0b 100644
--- a/drivers/iio/light/vcnl4000.c
+++ b/drivers/iio/light/vcnl4000.c
@@ -1513,7 +1513,6 @@ static int vcnl4040_write_event_config(struct iio_dev *indio_dev,
out:
mutex_unlock(&data->vcnl4000_lock);
- data->chip_spec->set_power_state(data, data->ps_int || data->als_int);
return ret;
}
--
2.42.0
This is a note to let you know that I've just added the patch titled
iio: addac: Kconfig: update ad74413r selections
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From b120dd3a15582fb7a959cecb05e4d9814fcba386 Mon Sep 17 00:00:00 2001
From: Antoniu Miclaus <antoniu.miclaus(a)analog.com>
Date: Tue, 12 Sep 2023 11:54:21 +0300
Subject: iio: addac: Kconfig: update ad74413r selections
Building ad74413r without selecting IIO_BUFFER and
IIO_TRIGGERED_BUFFER generates error with respect to the iio trigger
functions that are used within the driver.
Update the Kconfig accordingly.
Fixes: fea251b6a5db ("iio: addac: add AD74413R driver")
Signed-off-by: Antoniu Miclaus <antoniu.miclaus(a)analog.com>
Link: https://lore.kernel.org/r/20230912085421.51102-1-antoniu.miclaus@analog.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/addac/Kconfig | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/iio/addac/Kconfig b/drivers/iio/addac/Kconfig
index 877f9124803c..397544f23b85 100644
--- a/drivers/iio/addac/Kconfig
+++ b/drivers/iio/addac/Kconfig
@@ -24,6 +24,8 @@ config AD74413R
depends on GPIOLIB && SPI
select REGMAP_SPI
select CRC8
+ select IIO_BUFFER
+ select IIO_TRIGGERED_BUFFER
help
Say yes here to build support for Analog Devices AD74412R/AD74413R
quad-channel software configurable input/output solution.
--
2.42.0
This is a note to let you know that I've just added the patch titled
iio: pressure: dps310: Adjust Timeout Settings
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 901a293fd96fb9bab843ba4cc7be3094a5aa7c94 Mon Sep 17 00:00:00 2001
From: Lakshmi Yadlapati <lakshmiy(a)us.ibm.com>
Date: Tue, 29 Aug 2023 13:02:22 -0500
Subject: iio: pressure: dps310: Adjust Timeout Settings
The DPS310 sensor chip has been encountering intermittent errors while
reading the sensor device across various system designs. This issue causes
the chip to become "stuck," preventing the indication of "ready" status
for pressure and temperature measurements in the MEAS_CFG register.
To address this issue, this commit fixes the timeout settings to improve
sensor stability:
- After sending a reset command to the chip, the timeout has been extended
from 2.5 ms to 15 ms, aligning with the DPS310 specification.
- The read timeout value of the MEAS_CFG register has been adjusted from
20ms to 30ms to match the specification.
Signed-off-by: Lakshmi Yadlapati <lakshmiy(a)us.ibm.com>
Fixes: 7b4ab4abcea4 ("iio: pressure: dps310: Reset chip after timeout")
Link: https://lore.kernel.org/r/20230829180222.3431926-2-lakshmiy@us.ibm.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/pressure/dps310.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/iio/pressure/dps310.c b/drivers/iio/pressure/dps310.c
index b10dbf5cf494..1ff091b2f764 100644
--- a/drivers/iio/pressure/dps310.c
+++ b/drivers/iio/pressure/dps310.c
@@ -57,8 +57,8 @@
#define DPS310_RESET_MAGIC 0x09
#define DPS310_COEF_BASE 0x10
-/* Make sure sleep time is <= 20ms for usleep_range */
-#define DPS310_POLL_SLEEP_US(t) min(20000, (t) / 8)
+/* Make sure sleep time is <= 30ms for usleep_range */
+#define DPS310_POLL_SLEEP_US(t) min(30000, (t) / 8)
/* Silently handle error in rate value here */
#define DPS310_POLL_TIMEOUT_US(rc) ((rc) <= 0 ? 1000000 : 1000000 / (rc))
@@ -402,8 +402,8 @@ static int dps310_reset_wait(struct dps310_data *data)
if (rc)
return rc;
- /* Wait for device chip access: 2.5ms in specification */
- usleep_range(2500, 12000);
+ /* Wait for device chip access: 15ms in specification */
+ usleep_range(15000, 55000);
return 0;
}
--
2.42.0
This is a note to let you know that I've just added the patch titled
iio: imu: bno055: Fix missing Kconfig dependencies
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From c9b9cfe7d342683f624a89c3b617be18aff879e8 Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Date: Sun, 3 Sep 2023 12:30:52 +0100
Subject: iio: imu: bno055: Fix missing Kconfig dependencies
This driver uses IIO triggered buffers so it needs to select them in
Kconfig.
on riscv-32bit:
/opt/crosstool/gcc-13.2.0-nolibc/riscv32-linux/bin/riscv32-linux-ld: drivers/iio/imu/bno055/bno055.o: in function `.L367':
bno055.c:(.text+0x2c96): undefined reference to `devm_iio_triggered_buffer_setup_ext'
Reported-by: Randy Dunlap <rdunlap(a)infradead.org>
Closes: https://lore.kernel.org/linux-next/40566b4b-3950-81fe-ff14-871d8c447627@inf…
Fixes: 4aefe1c2bd0c ("iio: imu: add Bosch Sensortec BNO055 core driver")
Cc: Andrea Merello <andrea.merello(a)iit.it>
Acked-by: Randy Dunlap <rdunlap(a)infradead.org>
Tested-by: Randy Dunlap <rdunlap(a)infradead.org>
Link: https://lore.kernel.org/r/20230903113052.846298-1-jic23@kernel.org
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/imu/bno055/Kconfig | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/iio/imu/bno055/Kconfig b/drivers/iio/imu/bno055/Kconfig
index fa79b1ac4f85..83e53acfbe88 100644
--- a/drivers/iio/imu/bno055/Kconfig
+++ b/drivers/iio/imu/bno055/Kconfig
@@ -2,6 +2,8 @@
config BOSCH_BNO055
tristate
+ select IIO_BUFFER
+ select IIO_TRIGGERED_BUFFER
config BOSCH_BNO055_SERIAL
tristate "Bosch BNO055 attached via UART"
--
2.42.0
This is a note to let you know that I've just added the patch titled
iio: adc: imx8qxp: Fix address for command buffer registers
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 850101b3598277794f92a9e363a60a66e0d42890 Mon Sep 17 00:00:00 2001
From: Philipp Rossak <embed3d(a)gmail.com>
Date: Tue, 5 Sep 2023 00:02:04 +0200
Subject: iio: adc: imx8qxp: Fix address for command buffer registers
The ADC Command Buffer Register high and low are currently pointing to
the wrong address and makes it impossible to perform correct
ADC measurements over all channels.
According to the datasheet of the imx8qxp the ADC_CMDL register starts
at address 0x100 and the ADC_CMDH register starts at address 0x104.
This bug seems to be in the kernel since the introduction of this
driver.
This can be observed by checking all raw voltages of the adc and they
are all nearly identical:
cat /sys/bus/iio/devices/iio\:device0/in_voltage*_raw
3498
3494
3491
3491
3489
3490
3490
3490
Fixes: 1e23dcaa1a9fa ("iio: imx8qxp-adc: Add driver support for NXP IMX8QXP ADC")
Signed-off-by: Philipp Rossak <embed3d(a)gmail.com>
Acked-by: Haibo Chen <haibo.chen(a)nxp.com>
Link: https://lore.kernel.org/r/20230904220204.23841-1-embed3d@gmail.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/adc/imx8qxp-adc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/iio/adc/imx8qxp-adc.c b/drivers/iio/adc/imx8qxp-adc.c
index f5a0fc9e64c5..fff6e5a2d956 100644
--- a/drivers/iio/adc/imx8qxp-adc.c
+++ b/drivers/iio/adc/imx8qxp-adc.c
@@ -38,8 +38,8 @@
#define IMX8QXP_ADR_ADC_FCTRL 0x30
#define IMX8QXP_ADR_ADC_SWTRIG 0x34
#define IMX8QXP_ADR_ADC_TCTRL(tid) (0xc0 + (tid) * 4)
-#define IMX8QXP_ADR_ADC_CMDH(cid) (0x100 + (cid) * 8)
-#define IMX8QXP_ADR_ADC_CMDL(cid) (0x104 + (cid) * 8)
+#define IMX8QXP_ADR_ADC_CMDL(cid) (0x100 + (cid) * 8)
+#define IMX8QXP_ADR_ADC_CMDH(cid) (0x104 + (cid) * 8)
#define IMX8QXP_ADR_ADC_RESFIFO 0x300
#define IMX8QXP_ADR_ADC_TST 0xffc
--
2.42.0
This is a note to let you know that I've just added the patch titled
iio: admv1013: add mixer_vgate corner cases
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 287d998af24326b009ae0956820a3188501b34a0 Mon Sep 17 00:00:00 2001
From: Antoniu Miclaus <antoniu.miclaus(a)analog.com>
Date: Mon, 7 Aug 2023 17:38:05 +0300
Subject: iio: admv1013: add mixer_vgate corner cases
Include the corner cases in the computation of the MIXER_VGATE register
value.
According to the datasheet: The MIXER_VGATE values follows the VCM such
as, that for a 0V to 1.8V VCM, MIXER_VGATE = 23.89 VCM + 81, and for a >
1.8V to 2.6V VCM, MIXER_VGATE = 23.75 VCM + 1.25.
Fixes: da35a7b526d9 ("iio: frequency: admv1013: add support for ADMV1013")
Signed-off-by: Antoniu Miclaus <antoniu.miclaus(a)analog.com>
Reviewed-by: Nuno Sa <nuno.sa(a)analog.com>
Link: https://lore.kernel.org/r/20230807143806.6954-1-antoniu.miclaus@analog.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/frequency/admv1013.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/iio/frequency/admv1013.c b/drivers/iio/frequency/admv1013.c
index 6355c1f28423..92923074f930 100644
--- a/drivers/iio/frequency/admv1013.c
+++ b/drivers/iio/frequency/admv1013.c
@@ -351,9 +351,9 @@ static int admv1013_update_mixer_vgate(struct admv1013_state *st)
if (vcm < 0)
return vcm;
- if (vcm < 1800000)
+ if (vcm <= 1800000)
mixer_vgate = (2389 * vcm / 1000000 + 8100) / 100;
- else if (vcm > 1800000 && vcm < 2600000)
+ else if (vcm > 1800000 && vcm <= 2600000)
mixer_vgate = (2375 * vcm / 1000000 + 125) / 100;
else
return -EINVAL;
--
2.42.0
This is a note to let you know that I've just added the patch titled
iio: dac: ad3552r: Correct device IDs
to my char-misc git tree which can be found at
git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
in the char-misc-linus branch.
The patch will show up in the next release of the linux-next tree
(usually sometime within the next 24 hours during the week.)
The patch will hopefully also be merged in Linus's tree for the
next -rc kernel release.
If you have any questions about this process, please let me know.
From 9a85653ed3b9a9b7b31d95a34b64b990c3d33ca1 Mon Sep 17 00:00:00 2001
From: Marcelo Schmitt <marcelo.schmitt1(a)gmail.com>
Date: Thu, 3 Aug 2023 16:56:23 -0300
Subject: iio: dac: ad3552r: Correct device IDs
Device IDs for AD3542R and AD3552R were swapped leading to unintended
collection of DAC output ranges being used for each design.
Change device ID values so they are correct for each DAC chip.
Fixes: 8f2b54824b28 ("drivers:iio:dac: Add AD3552R driver support")
Signed-off-by: Marcelo Schmitt <marcelo.schmitt1(a)gmail.com>
Reported-by: Chandrakant Minajigi <Chandrakant.Minajigi(a)analog.com>
Link: https://lore.kernel.org/r/011f480220799fbfabdd53896f8a2f251ad995ad.16910913…
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
---
drivers/iio/dac/ad3552r.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/iio/dac/ad3552r.c b/drivers/iio/dac/ad3552r.c
index d5ea1a1be122..a492e8f2fc0f 100644
--- a/drivers/iio/dac/ad3552r.c
+++ b/drivers/iio/dac/ad3552r.c
@@ -140,8 +140,8 @@ enum ad3552r_ch_vref_select {
};
enum ad3542r_id {
- AD3542R_ID = 0x4008,
- AD3552R_ID = 0x4009,
+ AD3542R_ID = 0x4009,
+ AD3552R_ID = 0x4008,
};
enum ad3552r_ch_output_range {
--
2.42.0
hidpp_connect_event() has *four* time-of-check vs time-of-use (TOCTOU)
races when it races with itself.
hidpp_connect_event() primarily runs from a workqueue but it also runs
on probe() and if a "device-connected" packet is received by the hw
when the thread running hidpp_connect_event() from probe() is waiting on
the hw, then a second thread running hidpp_connect_event() will be
started from the workqueue.
This opens the following races (note the below code is simplified):
1. Retrieving + printing the protocol (harmless race):
if (!hidpp->protocol_major) {
hidpp_root_get_protocol_version()
hidpp->protocol_major = response.rap.params[0];
}
We can actually see this race hit in the dmesg in the abrt output
attached to rhbz#2227968:
[ 3064.624215] logitech-hidpp-device 0003:046D:4071.0049: HID++ 4.5 device connected.
[ 3064.658184] logitech-hidpp-device 0003:046D:4071.0049: HID++ 4.5 device connected.
Testing with extra logging added has shown that after this the 2 threads
take turn grabbing the hw access mutex (send_mutex) so they ping-pong
through all the other TOCTOU cases managing to hit all of them:
2. Updating the name to the HIDPP name (harmless race):
if (hidpp->name == hdev->name) {
...
hidpp->name = new_name;
}
3. Initializing the power_supply class for the battery (problematic!):
hidpp_initialize_battery()
{
if (hidpp->battery.ps)
return 0;
probe_battery(); /* Blocks, threads take turns executing this */
hidpp->battery.desc.properties =
devm_kmemdup(dev, hidpp_battery_props, cnt, GFP_KERNEL);
hidpp->battery.ps =
devm_power_supply_register(&hidpp->hid_dev->dev,
&hidpp->battery.desc, cfg);
}
4. Creating delayed input_device (potentially problematic):
if (hidpp->delayed_input)
return;
hidpp->delayed_input = hidpp_allocate_input(hdev);
The really big problem here is 3. Hitting the race leads to the following
sequence:
hidpp->battery.desc.properties =
devm_kmemdup(dev, hidpp_battery_props, cnt, GFP_KERNEL);
hidpp->battery.ps =
devm_power_supply_register(&hidpp->hid_dev->dev,
&hidpp->battery.desc, cfg);
...
hidpp->battery.desc.properties =
devm_kmemdup(dev, hidpp_battery_props, cnt, GFP_KERNEL);
hidpp->battery.ps =
devm_power_supply_register(&hidpp->hid_dev->dev,
&hidpp->battery.desc, cfg);
So now we have registered 2 power supplies for the same battery,
which looks a bit weird from userspace's pov but this is not even
the really big problem.
Notice how:
1. This is all devm-maganaged
2. The hidpp->battery.desc struct is shared between the 2 power supplies
3. hidpp->battery.desc points to the result from the second devm_kmemdup()
This causes a use after free scenario on USB disconnect of the receiver:
1. The last registered power supply class device gets unregistered
2. The memory from the last devm_kmemdup() call gets freed,
hidpp->battery.desc.properties now points to freed memory
3. The first registered power supply class device gets unregistered,
this involves sending a remove uevent to userspace which invokes
power_supply_uevent() to fill the uevent data
4. power_supply_uevent() uses hidpp->battery.desc.properties which
now points to freed memory leading to backtraces like this one:
Sep 22 20:01:35 eric kernel: BUG: unable to handle page fault for address: ffffb2140e017f08
...
Sep 22 20:01:35 eric kernel: Workqueue: usb_hub_wq hub_event
Sep 22 20:01:35 eric kernel: RIP: 0010:power_supply_uevent+0xee/0x1d0
...
Sep 22 20:01:35 eric kernel: ? asm_exc_page_fault+0x26/0x30
Sep 22 20:01:35 eric kernel: ? power_supply_uevent+0xee/0x1d0
Sep 22 20:01:35 eric kernel: ? power_supply_uevent+0x10d/0x1d0
Sep 22 20:01:35 eric kernel: dev_uevent+0x10f/0x2d0
Sep 22 20:01:35 eric kernel: kobject_uevent_env+0x291/0x680
Sep 22 20:01:35 eric kernel: power_supply_unregister+0x8e/0xa0
Sep 22 20:01:35 eric kernel: release_nodes+0x3d/0xb0
Sep 22 20:01:35 eric kernel: devres_release_group+0xfc/0x130
Sep 22 20:01:35 eric kernel: hid_device_remove+0x56/0xa0
Sep 22 20:01:35 eric kernel: device_release_driver_internal+0x19f/0x200
Sep 22 20:01:35 eric kernel: bus_remove_device+0xc6/0x130
Sep 22 20:01:35 eric kernel: device_del+0x15c/0x3f0
Sep 22 20:01:35 eric kernel: ? __queue_work+0x1df/0x440
Sep 22 20:01:35 eric kernel: hid_destroy_device+0x4b/0x60
Sep 22 20:01:35 eric kernel: logi_dj_remove+0x9a/0x100 [hid_logitech_dj 5c91534a0ead2b65e04dd799a0437e3b99b21bc4]
Sep 22 20:01:35 eric kernel: hid_device_remove+0x44/0xa0
Sep 22 20:01:35 eric kernel: device_release_driver_internal+0x19f/0x200
Sep 22 20:01:35 eric kernel: bus_remove_device+0xc6/0x130
Sep 22 20:01:35 eric kernel: device_del+0x15c/0x3f0
Sep 22 20:01:35 eric kernel: ? __queue_work+0x1df/0x440
Sep 22 20:01:35 eric kernel: hid_destroy_device+0x4b/0x60
Sep 22 20:01:35 eric kernel: usbhid_disconnect+0x47/0x60 [usbhid 727dcc1c0b94e6b4418727a468398ac3bca492f3]
Sep 22 20:01:35 eric kernel: usb_unbind_interface+0x90/0x270
Sep 22 20:01:35 eric kernel: device_release_driver_internal+0x19f/0x200
Sep 22 20:01:35 eric kernel: bus_remove_device+0xc6/0x130
Sep 22 20:01:35 eric kernel: device_del+0x15c/0x3f0
Sep 22 20:01:35 eric kernel: ? kobject_put+0xa0/0x1d0
Sep 22 20:01:35 eric kernel: usb_disable_device+0xcd/0x1e0
Sep 22 20:01:35 eric kernel: usb_disconnect+0xde/0x2c0
Sep 22 20:01:35 eric kernel: usb_disconnect+0xc3/0x2c0
Sep 22 20:01:35 eric kernel: hub_event+0xe80/0x1c10
There have been quite a few bug reports (see Link tags) about this crash.
Fix all the TOCTOU issues, including the really bad power-supply related
system crash on USB disconnect, by making probe() use the workqueue for
running hidpp_connect_event() too, so that it can never run more then once.
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2227221
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2227968
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2227968
Link: https://bugzilla.redhat.com/show_bug.cgi?id=2242189
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217412#c58
Cc: stable(a)vger.kernel.org
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
---
drivers/hid/hid-logitech-hidpp.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/hid/hid-logitech-hidpp.c b/drivers/hid/hid-logitech-hidpp.c
index ff077df0babf..a209d51bd247 100644
--- a/drivers/hid/hid-logitech-hidpp.c
+++ b/drivers/hid/hid-logitech-hidpp.c
@@ -4515,7 +4515,8 @@ static int hidpp_probe(struct hid_device *hdev, const struct hid_device_id *id)
goto hid_hw_init_fail;
}
- hidpp_connect_event(hidpp);
+ schedule_work(&hidpp->work);
+ flush_work(&hidpp->work);
if (will_restart) {
/* Reset the HID node state */
--
2.41.0
Hello,
Patch 1/4 needed a trivial adjustment, the rest were taken verbatim and
included here just for convenience.
Thanks,
Ilya
Ilya Dryomov (4):
rbd: move rbd_dev_refresh() definition
rbd: decouple header read-in from updating rbd_dev->header
rbd: decouple parent info read-in from updating rbd_dev
rbd: take header_rwsem in rbd_dev_refresh() only when updating
drivers/block/rbd.c | 412 ++++++++++++++++++++++++--------------------
1 file changed, 225 insertions(+), 187 deletions(-)
--
2.41.0
Hi Stable Kernel Team,
Can this patch be backported to v6.1+ kernels? The commit id is
5b4a82a0724a and has been upstream since v6.5. As was mentioned in the
original patch description (below), the commit being reverted by this
patch breaks state recovery in a way that is worse than the initial
bug that it was attempting to fix.
Thanks,
Anna
On Tue, Jun 27, 2023 at 2:31 PM Benjamin Coddington <bcodding(a)redhat.com> wrote:
>
> Olga Kornievskaia reports that this patch breaks NFSv4.0 state recovery.
> It also introduces additional complexity in the error paths for cases not
> related to the original problem. Let's revert it for now, and address the
> original problem in another manner.
>
> This reverts commit f5ea16137a3fa2858620dc9084466491c128535f.
>
> Fixes: f5ea16137a3f ("NFSv4: Retry LOCK on OLD_STATEID during delegation return")
> Reported-by: Kornievskaia, Olga <Olga.Kornievskaia(a)netapp.com>
> Signed-off-by: Benjamin Coddington <bcodding(a)redhat.com>
> ---
> fs/nfs/nfs4proc.c | 6 ++----
> 1 file changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
> index d3665390c4cb..6bb14f6cfbc0 100644
> --- a/fs/nfs/nfs4proc.c
> +++ b/fs/nfs/nfs4proc.c
> @@ -7159,7 +7159,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
> {
> struct nfs4_lockdata *data = calldata;
> struct nfs4_lock_state *lsp = data->lsp;
> - struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
>
> if (!nfs4_sequence_done(task, &data->res.seq_res))
> return;
> @@ -7167,7 +7166,8 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
> data->rpc_status = task->tk_status;
> switch (task->tk_status) {
> case 0:
> - renew_lease(server, data->timestamp);
> + renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
> + data->timestamp);
> if (data->arg.new_lock && !data->cancelled) {
> data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
> if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
> @@ -7188,8 +7188,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
> if (!nfs4_stateid_match(&data->arg.open_stateid,
> &lsp->ls_state->open_stateid))
> goto out_restart;
> - else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
> - goto out_restart;
> } else if (!nfs4_stateid_match(&data->arg.lock_stateid,
> &lsp->ls_stateid))
> goto out_restart;
> --
> 2.40.1
>
Hi,
We found that commits 46bdb4277f98 and 0388f9c74330 (merged in v5.12)
introduced more than 40% performance regression in UnixBench's process
creation benchmark when comparing 5.10 with 5.15 and 6.1 on AWS'
virtualized Graviton instances.
This has been mostly fixed by the following upstream commit:
"""
commit a89c6bcdac22bec1bfbe6e64060b4cf5838d4f47
Author: Gabriel Krisman Bertazi <krisman(a)suse.de>
Date: Mon Jan 9 12:19:55 2023 -0300
arm64: Avoid repeated AA64MMFR1_EL1 register read on pagefault path
"""
(This is merged in v6.3 and applies cleanly in 5.15 and 6.1).
We performed functional and performance regression tests on both 5.15
and 6.1 with this patch applied. We can also observe 10% improvement
in system time in other benchmarks.
Just to be transparent, we initially observed 5% degradation in a
benchmark we have with fio randwrite on EBS volumes with our 6.1
downstream kernel but this doesn't seem reproducible and we don't
think this patch would cause it.
Here's some quick UnixBench process creation measurements on
m7gd.16xlarge instance type. All results are the average of 5
consecutive runs with a single process:
* 5.10.197: 9886.68 lps (baseline)
* 6.1.55: 5531.24 lps (unfixed - 44% degradation)
* 5.15.133: 5960.12 lps (unfixed - 39% degradation)
* 5.15.133: 10306.4 lps (fixed - actually better than baseline)
* 6.1.55: 9393.42 lps (fixed - degradation reduced to 5%)
- Luiz
Hello,
On Mon, 2 Oct 2023 13:54:28 +0300 Sagi Grimberg <sagi(a)grimberg.me> wrote:
> From Alon:
> "Due to a logical bug in the NVMe-oF/TCP subsystem in the Linux kernel,
> a malicious user can cause a UAF and a double free, which may lead to
> RCE (may also lead to an LPE in case the attacker already has local
> privileges)."
>
> Hence, when a queue initialization fails after the ahash requests are
> allocated, it is guaranteed that the queue removal async work will be
> called, hence leave the deallocation to the queue removal.
>
> Also, be extra careful not to continue processing the socket, so set
> queue rcv_state to NVMET_TCP_RECV_ERR upon a socket error.
>
> Reported-by: Alon Zahavi <zahavi.alon(a)gmail.com>
> Tested-by: Alon Zahavi <zahavi.alon(a)gmail.com>
> Signed-off-by: Sagi Grimberg <sagi(a)grimberg.me>
Would it be better to add Fixes: and Cc: stable lines?
Thanks,
SJ
> ---
> drivers/nvme/target/tcp.c | 7 ++-----
> 1 file changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 97d07488072d..d840f996eb82 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -372,6 +372,7 @@ static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
>
> static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
> {
> + queue->rcv_state = NVMET_TCP_RECV_ERR;
> if (status == -EPIPE || status == -ECONNRESET)
> kernel_sock_shutdown(queue->sock, SHUT_RDWR);
> else
> @@ -910,15 +911,11 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
> iov.iov_len = sizeof(*icresp);
> ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
> if (ret < 0)
> - goto free_crypto;
> + return ret; /* queue removal will cleanup */
>
> queue->state = NVMET_TCP_Q_LIVE;
> nvmet_prepare_receive_pdu(queue);
> return 0;
> -free_crypto:
> - if (queue->hdr_digest || queue->data_digest)
> - nvmet_tcp_free_crypto(queue);
> - return ret;
> }
>
> static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
> --
> 2.41.0
>
>
The MediaTek DRM driver implements GEM PRIME vmap by fetching the
sg_table for the object, iterating through the pages, and then
vmapping them. In essence, unlike the GEM DMA helpers which vmap
when the object is first created or imported, the MediaTek version
does it on request.
Unfortunately, the code never correctly frees the sg_table contents.
This results in a kernel memory leak. On a Hayato device with a text
console on the internal display, this results in the system running
out of memory in a few days from all the console screen cursor updates.
Add sg_free_table() to correctly free the contents of the sg_table. This
was missing despite explicitly required by mtk_gem_prime_get_sg_table().
Also move the "out" shortcut label to after the kfree() call for the
sg_table. Having sg_free_table() together with kfree() makes more sense.
The shortcut is only used when the object already has a kernel address,
in which case the pointer is NULL and kfree() does nothing. Hence this
change causes no functional change.
Fixes: 3df64d7b0a4f ("drm/mediatek: Implement gem prime vmap/vunmap function")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Chen-Yu Tsai <wenst(a)chromium.org>
---
Please merge for v6.6 fixes.
Also, I was wondering why the MediaTek DRM driver implements a lot of
the GEM functionality itself, instead of using the GEM DMA helpers.
From what I could tell, the code closely follows the DMA helpers, except
that it vmaps the buffers only upon request.
drivers/gpu/drm/mediatek/mtk_drm_gem.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
index 9f364df52478..0e0a41b2f57f 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
@@ -239,6 +239,7 @@ int mtk_drm_gem_prime_vmap(struct drm_gem_object *obj, struct iosys_map *map)
npages = obj->size >> PAGE_SHIFT;
mtk_gem->pages = kcalloc(npages, sizeof(*mtk_gem->pages), GFP_KERNEL);
if (!mtk_gem->pages) {
+ sg_free_table(sgt);
kfree(sgt);
return -ENOMEM;
}
@@ -248,12 +249,15 @@ int mtk_drm_gem_prime_vmap(struct drm_gem_object *obj, struct iosys_map *map)
mtk_gem->kvaddr = vmap(mtk_gem->pages, npages, VM_MAP,
pgprot_writecombine(PAGE_KERNEL));
if (!mtk_gem->kvaddr) {
+ sg_free_table(sgt);
kfree(sgt);
kfree(mtk_gem->pages);
return -ENOMEM;
}
-out:
+ sg_free_table(sgt);
kfree(sgt);
+
+out:
iosys_map_set_vaddr(map, mtk_gem->kvaddr);
return 0;
--
2.42.0.582.g8ccd20d70d-goog
Hello,
I have been experimenting this issue:
https://www.spinics.net/lists/linux-ext4/msg86259.html, on a 5.15
kernel.
This issue caused by 5c48a7df9149 ("ext4: fix an use-after-free issue
about data=journal writeback mode") is affecting ext4 users with
data=journal on all stable kernels.
Jan proposed a fix here
https://www.spinics.net/lists/linux-ext4/msg87054.html which solves the
situation for me.
Now this fix is not upstream because the data journaling support has
been rewritten. As suggested by Jan, that would mean that we could
either backport the following patches from upstream:
bd159398a2d2 ("jdb2: Don't refuse invalidation of already invalidated buffers")
d84c9ebdac1e ("ext4: Mark pages with journalled data dirty")
265e72efa99f ("ext4: Keep pages with journalled data dirty")
5e1bdea6391d ("ext4: Clear dirty bit from pages without data to write")
1f1a55f0bf06 ("ext4: Commit transaction before writing back pages in data=journal mode")
e360c6ed7274 ("ext4: Drop special handling of journalled data from ext4_sync_file()")
c000dfec7e88 ("ext4: Drop special handling of journalled data from extent shifting operations")
783ae448b7a2 ("ext4: Fix special handling of journalled data from extent zeroing")
56c2a0e3d90d ("ext4: Drop special handling of journalled data from ext4_evict_inode()")
7c375870fdc5 ("ext4: Drop special handling of journalled data from ext4_quota_on()")
951cafa6b80e ("ext4: Simplify handling of journalled data in ext4_bmap()")
ab382539adcb ("ext4: Update comment in mpage_prepare_extent_to_map()")
d0ab8368c175 ("Revert "ext4: Fix warnings when freezing filesystem with journaled data"")
1077b2d53ef5 ("ext4: fix fsync for non-directories")
Or apply the proposed, attached patch. Do you think that would be an
option?
Thanks,
Mathieu
An errata fix, and a fix for it
Robin Murphy (2):
iommu/arm-smmu-v3: Set TTL invalidation hint better
iommu/arm-smmu-v3: Avoid constructing invalid range commands
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
--
2.34.1
An errata fix, and a fix for it
Robin Murphy (2):
iommu/arm-smmu-v3: Set TTL invalidation hint better
iommu/arm-smmu-v3: Avoid constructing invalid range commands
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
--
2.34.1
Hello,
Please queue up this commit for 6.5-stable:
* commit: 88cc47e24597971b05b6e94c28a2fc81d2a8d61a
("perf build: Define YYNOMEM as YYNOABORT for bison < 3.81")
* Author: Arnaldo Carvalho de Melo <acme(a)redhat.com>
The recent change v6.5 series added YYNOMEM changes
in the perf tool and it caused a build failure on machines with
older bison. The above commit should be applied to fix it.
Thanks,
Namhyung
The GUCTL1.DEV_FORCE_20_CLK_FOR_30_CLK bit enable the feature of internal
2.0(utmi/ulpi) clock to be routed as the 3.0 (pipe) clock. This feature is
applicable when core is operating in 2.0 device mode.
When this bit is set in host mode and core is in 2.0 device mode (maximum
speed = high-speed) then usb super speed devices not detected on host.
To address the above issue added usb device mode conditional check.
Cc: stable(a)vger.kernel.org
Fixes: 62b20e6e0dde ("usb: dwc3: core: do not use 3.0 clock when operating in 2.0 mode")
Signed-off-by: Piyush Mehta <piyush.mehta(a)amd.com>
---
DWC3 Register Map Link:
https://docs.xilinx.com/r/en-US/ug1087-zynq-ultrascale-registers/GUCTL1-USB…
Register Name GUCTL1
Bit: 26
Bit Name: DEV_FORCE_20_CLK_FOR_30_CLK
Change in V2:
- Added CC stable kernel email.
Link: https://lore.kernel.org/all/20231005102725.8458-1-piyush.mehta@amd.com/
---
drivers/usb/dwc3/core.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index 9c6bf054f15d..0cf1fe60628b 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -1202,6 +1202,7 @@ static int dwc3_core_init(struct dwc3 *dwc)
reg |= DWC3_GUCTL1_PARKMODE_DISABLE_HS;
if (DWC3_VER_IS_WITHIN(DWC3, 290A, ANY) &&
+ (dwc->dr_mode == USB_DR_MODE_PERIPHERAL) &&
(dwc->maximum_speed == USB_SPEED_HIGH ||
dwc->maximum_speed == USB_SPEED_FULL))
reg |= DWC3_GUCTL1_DEV_FORCE_20_CLK_FOR_30_CLK;
--
2.17.1
The patch below does not apply to the 6.5-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.5.y
git checkout FETCH_HEAD
git cherry-pick -x 27e5ccc2d5a50ed61bb73153edb1066104b108b3
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100413-cringe-praising-fc38@gregkh' --subject-prefix 'PATCH 6.5.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 27e5ccc2d5a50ed61bb73153edb1066104b108b3 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Sat, 16 Sep 2023 12:52:49 +0200
Subject: [PATCH] mptcp: fix dangling connection hang-up
According to RFC 8684 section 3.3:
A connection is not closed unless [...] or an implementation-specific
connection-level send timeout.
Currently the MPTCP protocol does not implement such timeout, and
connection timing-out at the TCP-level never move to close state.
Introduces a catch-up condition at subflow close time to move the
MPTCP socket to close, too.
That additionally allows removing similar existing inside the worker.
Finally, allow some additional timeout for plain ESTABLISHED mptcp
sockets, as the protocol allows creating new subflows even at that
point and making the connection functional again.
This issue is actually present since the beginning, but it is basically
impossible to solve without a long chain of functional pre-requisites
topped by commit bbd49d114d57 ("mptcp: consolidate transition to
TCP_CLOSE in mptcp_do_fastclose()"). When backporting this current
patch, please also backport this other commit as well.
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/430
Fixes: e16163b6e2b7 ("mptcp: refactor shutdown and close")
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c8f38f303a90..e252539b1e19 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -892,6 +892,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
mptcp_sockopt_sync_locked(msk, ssk);
mptcp_subflow_joined(msk, ssk);
+ mptcp_stop_tout_timer(sk);
return true;
}
@@ -2369,18 +2370,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
bool dispose_it, need_push = false;
/* If the first subflow moved to a close state before accept, e.g. due
- * to an incoming reset, mptcp either:
- * - if either the subflow or the msk are dead, destroy the context
- * (the subflow socket is deleted by inet_child_forget) and the msk
- * - otherwise do nothing at the moment and take action at accept and/or
- * listener shutdown - user-space must be able to accept() the closed
- * socket.
+ * to an incoming reset or listener shutdown, the subflow socket is
+ * already deleted by inet_child_forget() and the mptcp socket can't
+ * survive too.
*/
- if (msk->in_accept_queue && msk->first == ssk) {
- if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD))
- return;
-
+ if (msk->in_accept_queue && msk->first == ssk &&
+ (sock_flag(sk, SOCK_DEAD) || sock_flag(ssk, SOCK_DEAD))) {
/* ensure later check in mptcp_worker() will dispose the msk */
+ mptcp_set_close_tout(sk, tcp_jiffies32 - (TCP_TIMEWAIT_LEN + 1));
sock_set_flag(sk, SOCK_DEAD);
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
mptcp_subflow_drop_ctx(ssk);
@@ -2443,6 +2440,22 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
out:
if (need_push)
__mptcp_push_pending(sk, 0);
+
+ /* Catch every 'all subflows closed' scenario, including peers silently
+ * closing them, e.g. due to timeout.
+ * For established sockets, allow an additional timeout before closing,
+ * as the protocol can still create more subflows.
+ */
+ if (list_is_singular(&msk->conn_list) && msk->first &&
+ inet_sk_state_load(msk->first) == TCP_CLOSE) {
+ if (sk->sk_state != TCP_ESTABLISHED ||
+ msk->in_accept_queue || sock_flag(sk, SOCK_DEAD)) {
+ inet_sk_state_store(sk, TCP_CLOSE);
+ mptcp_close_wake_up(sk);
+ } else {
+ mptcp_start_tout_timer(sk);
+ }
+ }
}
void mptcp_close_ssk(struct sock *sk, struct sock *ssk,
@@ -2486,23 +2499,14 @@ static void __mptcp_close_subflow(struct sock *sk)
}
-static bool mptcp_should_close(const struct sock *sk)
+static bool mptcp_close_tout_expired(const struct sock *sk)
{
- s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp;
- struct mptcp_subflow_context *subflow;
+ if (!inet_csk(sk)->icsk_mtup.probe_timestamp ||
+ sk->sk_state == TCP_CLOSE)
+ return false;
- if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue)
- return true;
-
- /* if all subflows are in closed status don't bother with additional
- * timeout
- */
- mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
- if (inet_sk_state_load(mptcp_subflow_tcp_sock(subflow)) !=
- TCP_CLOSE)
- return false;
- }
- return true;
+ return time_after32(tcp_jiffies32,
+ inet_csk(sk)->icsk_mtup.probe_timestamp + TCP_TIMEWAIT_LEN);
}
static void mptcp_check_fastclose(struct mptcp_sock *msk)
@@ -2641,15 +2645,16 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout)
struct sock *sk = (struct sock *)msk;
unsigned long timeout, close_timeout;
- if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
+ if (!fail_tout && !inet_csk(sk)->icsk_mtup.probe_timestamp)
return;
- close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
+ close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies +
+ TCP_TIMEWAIT_LEN;
/* the close timeout takes precedence on the fail one, and here at least one of
* them is active
*/
- timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
+ timeout = inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout;
sk_reset_timer(sk, &sk->sk_timer, timeout);
}
@@ -2668,8 +2673,6 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
mptcp_subflow_reset(ssk);
WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
unlock_sock_fast(ssk, slow);
-
- mptcp_reset_tout_timer(msk, 0);
}
static void mptcp_do_fastclose(struct sock *sk)
@@ -2706,18 +2709,14 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
__mptcp_close_subflow(sk);
- /* There is no point in keeping around an orphaned sk timedout or
- * closed, but we need the msk around to reply to incoming DATA_FIN,
- * even if it is orphaned and in FIN_WAIT2 state
- */
- if (sock_flag(sk, SOCK_DEAD)) {
- if (mptcp_should_close(sk))
- mptcp_do_fastclose(sk);
+ if (mptcp_close_tout_expired(sk)) {
+ mptcp_do_fastclose(sk);
+ mptcp_close_wake_up(sk);
+ }
- if (sk->sk_state == TCP_CLOSE) {
- __mptcp_destroy_sock(sk);
- goto unlock;
- }
+ if (sock_flag(sk, SOCK_DEAD) && sk->sk_state == TCP_CLOSE) {
+ __mptcp_destroy_sock(sk);
+ goto unlock;
}
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
@@ -3016,7 +3015,6 @@ bool __mptcp_close(struct sock *sk, long timeout)
cleanup:
/* orphan all the subflows */
- inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
@@ -3053,7 +3051,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
- mptcp_reset_tout_timer(msk, 0);
+ mptcp_start_tout_timer(sk);
}
return do_cancel_work;
@@ -3117,7 +3115,7 @@ static int mptcp_disconnect(struct sock *sk, int flags)
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_stop_rtx_timer(sk);
- sk_stop_timer(sk, &sk->sk_timer);
+ mptcp_stop_tout_timer(sk);
if (msk->token)
mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 5e2026815c8e..ed61d6850cce 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -719,6 +719,28 @@ void mptcp_get_options(const struct sk_buff *skb,
void mptcp_finish_connect(struct sock *sk);
void __mptcp_set_connected(struct sock *sk);
void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout);
+
+static inline void mptcp_stop_tout_timer(struct sock *sk)
+{
+ if (!inet_csk(sk)->icsk_mtup.probe_timestamp)
+ return;
+
+ sk_stop_timer(sk, &sk->sk_timer);
+ inet_csk(sk)->icsk_mtup.probe_timestamp = 0;
+}
+
+static inline void mptcp_set_close_tout(struct sock *sk, unsigned long tout)
+{
+ /* avoid 0 timestamp, as that means no close timeout */
+ inet_csk(sk)->icsk_mtup.probe_timestamp = tout ? : 1;
+}
+
+static inline void mptcp_start_tout_timer(struct sock *sk)
+{
+ mptcp_set_close_tout(sk, tcp_jiffies32);
+ mptcp_reset_tout_timer(mptcp_sk(sk), 0);
+}
+
static inline bool mptcp_is_fully_established(struct sock *sk)
{
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 433f290984c8..918c1a235790 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1552,6 +1552,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
mptcp_sock_graft(ssk, sk->sk_socket);
iput(SOCK_INODE(sf));
WRITE_ONCE(msk->allow_infinite_fallback, false);
+ mptcp_stop_tout_timer(sk);
return 0;
failed_unlink:
On 10/5/23 00:24, Ron Economos wrote:
> On 10/4/23 10:52 AM, Greg Kroah-Hartman wrote:
>> This is the start of the stable review cycle for the 6.5.6 release.
>> There are 321 patches in this series, all will be posted as a response
>> to this one. If anyone has any issues with these being applied, please
>> let me know.
>>
>> Responses should be made by Fri, 06 Oct 2023 17:51:12 +0000.
>> Anything received after that time might be too late.
>>
>> The whole patch series can be found in one patch at:
>> https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.5.6-rc1.…
>> or in the git tree and branch at:
>> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.5.y
>> and the diffstat can be found below.
>>
>> thanks,
>>
>> greg k-h
>>
> On RISC-V RV64. Bisecting now.
Naresh already reported that they have tracked down missing commits, see:
https://lore.kernel.org/r/CA+G9fYvHPnba-0=uGS70EjcPgHht13j3s-_fmd2=srL0xyPj…
--
Florian
Hello,
This is the series I sent out earlier today for 5.10-6.1 backported
to 5.4. More adjustments were needed but still very modest.
Thanks,
Ilya
Ilya Dryomov (4):
rbd: move rbd_dev_refresh() definition
rbd: decouple header read-in from updating rbd_dev->header
rbd: decouple parent info read-in from updating rbd_dev
rbd: take header_rwsem in rbd_dev_refresh() only when updating
drivers/block/rbd.c | 420 ++++++++++++++++++++++++--------------------
1 file changed, 230 insertions(+), 190 deletions(-)
--
2.41.0
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x d5fbeff1ab812b6c473b6924bee8748469462e2c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100424-rumor-garter-af90@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d5fbeff1ab812b6c473b6924bee8748469462e2c Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni(a)redhat.com>
Date: Sat, 16 Sep 2023 12:52:46 +0200
Subject: [PATCH] mptcp: move __mptcp_error_report in protocol.c
This will simplify the next patch ("mptcp: process pending subflow error
on close").
No functional change intended.
Cc: stable(a)vger.kernel.org # v5.12+
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts <matthieu.baerts(a)tessares.net>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index a7fc16f5175d..915860027b1a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -770,6 +770,42 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
return moved;
}
+void __mptcp_error_report(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ int err = sock_error(ssk);
+ int ssk_state;
+
+ if (!err)
+ continue;
+
+ /* only propagate errors on fallen-back sockets or
+ * on MPC connect
+ */
+ if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
+ continue;
+
+ /* We need to propagate only transition to CLOSE state.
+ * Orphaned socket will see such state change via
+ * subflow_sched_work_if_closed() and that path will properly
+ * destroy the msk as needed.
+ */
+ ssk_state = inet_sk_state_load(ssk);
+ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+ inet_sk_state_store(sk, ssk_state);
+ WRITE_ONCE(sk->sk_err, -err);
+
+ /* This barrier is coupled with smp_rmb() in mptcp_poll() */
+ smp_wmb();
+ sk_error_report(sk);
+ break;
+ }
+}
+
/* In most cases we will be able to lock the mptcp socket. If its already
* owned, we need to defer to the work queue to avoid ABBA deadlock.
*/
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 9bf3c7bc1762..2f40c23fdb0d 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1362,42 +1362,6 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space)
*full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf));
}
-void __mptcp_error_report(struct sock *sk)
-{
- struct mptcp_subflow_context *subflow;
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- int err = sock_error(ssk);
- int ssk_state;
-
- if (!err)
- continue;
-
- /* only propagate errors on fallen-back sockets or
- * on MPC connect
- */
- if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
- continue;
-
- /* We need to propagate only transition to CLOSE state.
- * Orphaned socket will see such state change via
- * subflow_sched_work_if_closed() and that path will properly
- * destroy the msk as needed.
- */
- ssk_state = inet_sk_state_load(ssk);
- if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
- inet_sk_state_store(sk, ssk_state);
- WRITE_ONCE(sk->sk_err, -err);
-
- /* This barrier is coupled with smp_rmb() in mptcp_poll() */
- smp_wmb();
- sk_error_report(sk);
- break;
- }
-}
-
static void subflow_error_report(struct sock *ssk)
{
struct sock *sk = mptcp_subflow_ctx(ssk)->conn;
This is an automatic generated email to let you know that the following patch were queued:
Subject: media: ccs: Correctly initialise try compose rectangle
Author: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Date: Mon Sep 4 15:57:37 2023 +0300
Initialise the try sink compose rectangle size to the sink compose
rectangle for binner and scaler sub-devices. This was missed due to the
faulty condition that lead to the compose rectangles to be initialised for
the pixel array sub-device where it is not relevant.
Fixes: ccfc97bdb5ae ("[media] smiapp: Add driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco(a)xs4all.nl>
drivers/media/i2c/ccs/ccs-core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
---
diff --git a/drivers/media/i2c/ccs/ccs-core.c b/drivers/media/i2c/ccs/ccs-core.c
index 6a8116454f87..022e8712d48e 100644
--- a/drivers/media/i2c/ccs/ccs-core.c
+++ b/drivers/media/i2c/ccs/ccs-core.c
@@ -3097,7 +3097,7 @@ static int ccs_open(struct v4l2_subdev *sd, struct v4l2_subdev_fh *fh)
try_fmt->code = sensor->internal_csi_format->code;
try_fmt->field = V4L2_FIELD_NONE;
- if (ssd != sensor->pixel_array)
+ if (ssd == sensor->pixel_array)
continue;
try_comp = v4l2_subdev_get_try_compose(sd, fh->state, i);
If same devices with same device IDs are present on different soundwire
buses, the probe fails due to conflicting device names and sysfs
entries:
sysfs: cannot create duplicate filename '/bus/soundwire/devices/sdw:0:0217:0204:00:0'
The link ID is 0 for both devices, so they should be differentiated by
bus ID. Add the bus ID so, the device names and sysfs entries look
like:
sdw:1:0:0217:0204:00:0 -> ../../../devices/platform/soc@0/6ab0000.soundwire-controller/sdw-master-1/sdw:1:0:0217:0204:00:0
sdw:3:0:0217:0204:00:0 -> ../../../devices/platform/soc@0/6b10000.soundwire-controller/sdw-master-3/sdw:3:0:0217:0204:00:0
Fixes: 7c3cd189b86d ("soundwire: Add Master registration")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)linaro.org>
---
Sending as RFT, because I did not test it on that many devices and
user-spaces.
---
drivers/soundwire/slave.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c
index c1c1a2ac293a..4db43ea53d47 100644
--- a/drivers/soundwire/slave.c
+++ b/drivers/soundwire/slave.c
@@ -39,14 +39,14 @@ int sdw_slave_add(struct sdw_bus *bus,
slave->dev.fwnode = fwnode;
if (id->unique_id == SDW_IGNORED_UNIQUE_ID) {
- /* name shall be sdw:link:mfg:part:class */
- dev_set_name(&slave->dev, "sdw:%01x:%04x:%04x:%02x",
- bus->link_id, id->mfg_id, id->part_id,
+ /* name shall be sdw:bus:link:mfg:part:class */
+ dev_set_name(&slave->dev, "sdw:%01x:%01x:%04x:%04x:%02x",
+ bus->id, bus->link_id, id->mfg_id, id->part_id,
id->class_id);
} else {
- /* name shall be sdw:link:mfg:part:class:unique */
- dev_set_name(&slave->dev, "sdw:%01x:%04x:%04x:%02x:%01x",
- bus->link_id, id->mfg_id, id->part_id,
+ /* name shall be sdw:bus:link:mfg:part:class:unique */
+ dev_set_name(&slave->dev, "sdw:%01x:%01x:%04x:%04x:%02x:%01x",
+ bus->id, bus->link_id, id->mfg_id, id->part_id,
id->class_id, id->unique_id);
}
--
2.34.1
From: Kory Maincent <kory.maincent(a)bootlin.com>
A bitset without mask in a _SET request means we want exactly the bits in
the bitset to be set. This works correctly for compact format but when
verbose format is parsed, ethnl_update_bitset32_verbose() only sets the
bits present in the request bitset but does not clear the rest. The commit
6699170376ab fixes this issue by clearing the whole target bitmap before we
start iterating. The solution proposed brought an issue with the behavior
of the mod variable. As the bitset is always cleared the old val will
always differ to the new val.
Fix it by adding a new temporary variable which save the state of the old
bitmap.
Fixes: 6699170376ab ("ethtool: fix application of verbose no_mask bitset")
Signed-off-by: Kory Maincent <kory.maincent(a)bootlin.com>
Cc: stable(a)vger.kernel.org
---
net/ethtool/bitset.c | 25 +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
index 0515d6604b3b..95f11b0a38b4 100644
--- a/net/ethtool/bitset.c
+++ b/net/ethtool/bitset.c
@@ -432,7 +432,9 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
struct netlink_ext_ack *extack, bool *mod)
{
struct nlattr *bit_attr;
+ u32 *tmp = NULL;
bool no_mask;
+ bool dummy;
int rem;
int ret;
@@ -448,8 +450,11 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
}
no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
- if (no_mask)
- ethnl_bitmap32_clear(bitmap, 0, nbits, mod);
+ if (no_mask) {
+ tmp = kcalloc(nbits, sizeof(u32), GFP_KERNEL);
+ memcpy(tmp, bitmap, nbits);
+ ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy);
+ }
nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
bool old_val, new_val;
@@ -458,13 +463,18 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) {
NL_SET_ERR_MSG_ATTR(extack, bit_attr,
"only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask,
names, extack);
if (ret < 0)
- return ret;
- old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32));
+ goto out;
+ if (no_mask)
+ old_val = tmp[idx / 32] & ((u32)1 << (idx % 32));
+ else
+ old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32));
+
if (new_val != old_val) {
if (new_val)
bitmap[idx / 32] |= ((u32)1 << (idx % 32));
@@ -474,7 +484,10 @@ ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
}
}
- return 0;
+ ret = 0;
+out:
+ kfree(tmp);
+ return ret;
}
static int ethnl_compact_sanity_checks(unsigned int nbits,
--
2.25.1
While searching for possible refactor of napi_schedule_prep and
__napi_schedule it was notice that the mtk eth driver disable the
interrupt for rx and tx AFTER napi is scheduled.
While this is a very hard to repro case it might happen to have
situation where the interrupt is disabled and never enabled again as the
napi completes and the interrupt is enabled before.
This is caused by the fact that a napi driven by interrupt expect a
logic with:
1. interrupt received. napi prepared -> interrupt disabled -> napi
scheduled
2. napi triggered. ring cleared -> interrupt enabled -> wait for new
interrupt
To prevent this case, disable the interrupt BEFORE the napi is
scheduled.
Fixes: 656e705243fd ("net-next: mediatek: add support for MT7623 ethernet")
Cc: stable(a)vger.kernel.org
Signed-off-by: Christian Marangi <ansuelsmth(a)gmail.com>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 82b51072aad8..7669b446915a 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -3172,8 +3172,8 @@ static irqreturn_t mtk_handle_irq_rx(int irq, void *_eth)
eth->rx_events++;
if (likely(napi_schedule_prep(ð->rx_napi))) {
- __napi_schedule(ð->rx_napi);
mtk_rx_irq_disable(eth, eth->soc->txrx.rx_irq_done_mask);
+ __napi_schedule(ð->rx_napi);
}
return IRQ_HANDLED;
@@ -3185,8 +3185,8 @@ static irqreturn_t mtk_handle_irq_tx(int irq, void *_eth)
eth->tx_events++;
if (likely(napi_schedule_prep(ð->tx_napi))) {
- __napi_schedule(ð->tx_napi);
mtk_tx_irq_disable(eth, MTK_TX_DONE_INT);
+ __napi_schedule(ð->tx_napi);
}
return IRQ_HANDLED;
--
2.40.1
From: Miquel Raynal <miquel.raynal(a)bootlin.com>
Upstream commit 717c6ec241b5 ("can: sja1000: Prevent overrun stalls with
a soft reset on Renesas SoCs") fixes an issue with Renesas own SJA1000
CAN controller reception: the Rx buffer is only 5 messages long, so when
the bus loaded (eg. a message every 50us), overrun may easily
happen. Upon an overrun situation, due to a possible internal crosstalk
situation, the controller enters a frozen state which only can be
unlocked with a soft reset (experimentally). The solution was to offload
a call to sja1000_start() in a threaded handler. This needs to happen in
process context as this operation requires to sleep. sja1000_start()
basically enters "reset mode", performs a proper software reset and
returns back into "normal mode".
Since this fix was introduced, we no longer observe any stalls in
reception. However it was sporadically observed that the transmit path
would now freeze. Further investigation blamed the fix mentioned above,
and especially the reset operation. Reproducing the reset in a loop
helped identifying what could possibly go wrong. The sja1000 is a single
Tx queue device, which leverages the netdev helpers to process one Tx
message at a time. The logic is: the queue is stopped, the message sent
to the transceiver, once properly transmitted the controller sets a
status bit which triggers an interrupt, in the interrupt handler the
transmission status is checked and the queue woken up. Unfortunately, if
an overrun happens, we might perform the soft reset precisely between
the transmission of the buffer to the transceiver and the advent of the
transmission status bit. We would then stop the transmission operation
without re-enabling the queue, leading to all further transmissions to
be ignored.
The reset interrupt can only happen while the device is "open", and
after a reset we anyway want to resume normal operations, no matter if a
packet to transmit got dropped in the process, so we shall wake up the
queue. Restarting the device and waking-up the queue is exactly what
sja1000_set_mode(CAN_MODE_START) does. In order to be consistent about
the queue state, we must acquire a lock both in the reset handler and in
the transmit path to ensure serialization of both operations. It turns
out, a lock is already held when entering the transmit path, so we can
just acquire/release it as well with the regular net helpers inside the
threaded interrupt handler and this way we should be safe. As the
reset handler might still be called after the transmission of a frame to
the transceiver but before it actually gets transmitted, we must ensure
we don't leak the skb, so we free it (the behavior is consistent, no
matter if there was an skb on the stack or not).
Fixes: 717c6ec241b5 ("can: sja1000: Prevent overrun stalls with a soft reset on Renesas SoCs")
Cc: stable(a)vger.kernel.org
Signed-off-by: Miquel Raynal <miquel.raynal(a)bootlin.com>
Link: https://lore.kernel.org/all/20231002160206.190953-1-miquel.raynal@bootlin.c…
[mkl: fixed call to can_free_echo_skb()]
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
---
drivers/net/can/sja1000/sja1000.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c
index 0ada0e160e93..743c2eb62b87 100644
--- a/drivers/net/can/sja1000/sja1000.c
+++ b/drivers/net/can/sja1000/sja1000.c
@@ -392,7 +392,13 @@ static irqreturn_t sja1000_reset_interrupt(int irq, void *dev_id)
struct net_device *dev = (struct net_device *)dev_id;
netdev_dbg(dev, "performing a soft reset upon overrun\n");
- sja1000_start(dev);
+
+ netif_tx_lock(dev);
+
+ can_free_echo_skb(dev, 0, NULL);
+ sja1000_set_mode(dev, CAN_MODE_START);
+
+ netif_tx_unlock(dev);
return IRQ_HANDLED;
}
--
2.40.1
Fix three issues with resctrl selftests.
The signal handling fix became necessary after the mount/umount fixes.
The other two came up when I ran resctrl selftests across the server
fleet in our lab to validate the upcoming CAT test rewrite (the rewrite
is not part of this series).
These are developed and should apply cleanly at least on top the
benchmark cleanup series (might apply cleanly also w/o the benchmark
series, I didn't test).
v2:
- Include patch to move _GNU_SOURCE to Makefile to allow normal #include
placement
- Rework the signal register/unregister into patch to use helpers
- Fixed incorrect function parameter description
- Use return !!res to avoid confusing implicit boolean conversion
- Improve MBA/MBM success bound patch's changelog
- Tweak Cc: stable dependencies (make it a chain).
Ilpo Järvinen (6):
selftests/resctrl: Extend signal handler coverage to unmount on
receiving signal
selftests/resctrl: Remove duplicate feature check from CMT test
selftests/resctrl: Move _GNU_SOURCE define into Makefile
selftests/resctrl: Refactor feature check to use resource and feature
name
selftests/resctrl: Fix feature checks
selftests/resctrl: Reduce failures due to outliers in MBA/MBM tests
tools/testing/selftests/resctrl/Makefile | 2 +-
tools/testing/selftests/resctrl/cat_test.c | 8 --
tools/testing/selftests/resctrl/cmt_test.c | 3 -
tools/testing/selftests/resctrl/mba_test.c | 2 +-
tools/testing/selftests/resctrl/mbm_test.c | 2 +-
tools/testing/selftests/resctrl/resctrl.h | 7 +-
.../testing/selftests/resctrl/resctrl_tests.c | 78 +++++++++++--------
tools/testing/selftests/resctrl/resctrl_val.c | 22 +++---
tools/testing/selftests/resctrl/resctrlfs.c | 69 +++++++---------
9 files changed, 88 insertions(+), 105 deletions(-)
--
2.30.2
The patch below does not apply to the 6.5-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.5.y
git checkout FETCH_HEAD
git cherry-pick -x 8b4d9469d0b0e553208ee6f62f2807111fde18b9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100443-freewill-shanty-0f05@gregkh' --subject-prefix 'PATCH 6.5.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8b4d9469d0b0e553208ee6f62f2807111fde18b9 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal(a)kernel.org>
Date: Tue, 5 Sep 2023 09:06:23 +0900
Subject: [PATCH] ata: libata-scsi: Fix delayed scsi_rescan_device() execution
Commit 6aa0365a3c85 ("ata: libata-scsi: Avoid deadlock on rescan after
device resume") modified ata_scsi_dev_rescan() to check the scsi device
"is_suspended" power field to ensure that the scsi device associated
with an ATA device is fully resumed when scsi_rescan_device() is
executed. However, this fix is problematic as:
1) It relies on a PM internal field that should not be used without PM
device locking protection.
2) The check for is_suspended and the call to scsi_rescan_device() are
not atomic and a suspend PM event may be triggered between them,
casuing scsi_rescan_device() to be called on a suspended device and
in that function blocking while holding the scsi device lock. This
would deadlock a following resume operation.
These problems can trigger PM deadlocks on resume, especially with
resume operations triggered quickly after or during suspend operations.
E.g., a simple bash script like:
for (( i=0; i<10; i++ )); do
echo "+2 > /sys/class/rtc/rtc0/wakealarm
echo mem > /sys/power/state
done
that triggers a resume 2 seconds after starting suspending a system can
quickly lead to a PM deadlock preventing the system from correctly
resuming.
Fix this by replacing the check on is_suspended with a check on the
return value given by scsi_rescan_device() as that function will fail if
called against a suspended device. Also make sure rescan tasks already
scheduled are first cancelled before suspending an ata port.
Fixes: 6aa0365a3c85 ("ata: libata-scsi: Avoid deadlock on rescan after device resume")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal(a)kernel.org>
Reviewed-by: Hannes Reinecke <hare(a)suse.de>
Reviewed-by: Niklas Cassel <niklas.cassel(a)wdc.com>
Tested-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index a0bc01606b30..092372334e92 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5168,11 +5168,27 @@ static const unsigned int ata_port_suspend_ehi = ATA_EHI_QUIET
static void ata_port_suspend(struct ata_port *ap, pm_message_t mesg)
{
+ /*
+ * We are about to suspend the port, so we do not care about
+ * scsi_rescan_device() calls scheduled by previous resume operations.
+ * The next resume will schedule the rescan again. So cancel any rescan
+ * that is not done yet.
+ */
+ cancel_delayed_work_sync(&ap->scsi_rescan_task);
+
ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, false);
}
static void ata_port_suspend_async(struct ata_port *ap, pm_message_t mesg)
{
+ /*
+ * We are about to suspend the port, so we do not care about
+ * scsi_rescan_device() calls scheduled by previous resume operations.
+ * The next resume will schedule the rescan again. So cancel any rescan
+ * that is not done yet.
+ */
+ cancel_delayed_work_sync(&ap->scsi_rescan_task);
+
ata_port_request_pm(ap, mesg, 0, ata_port_suspend_ehi, true);
}
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index a0e58d22d222..6850cac803c1 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -4756,7 +4756,7 @@ void ata_scsi_dev_rescan(struct work_struct *work)
struct ata_link *link;
struct ata_device *dev;
unsigned long flags;
- bool delay_rescan = false;
+ int ret = 0;
mutex_lock(&ap->scsi_scan_mutex);
spin_lock_irqsave(ap->lock, flags);
@@ -4765,37 +4765,34 @@ void ata_scsi_dev_rescan(struct work_struct *work)
ata_for_each_dev(dev, link, ENABLED) {
struct scsi_device *sdev = dev->sdev;
+ /*
+ * If the port was suspended before this was scheduled,
+ * bail out.
+ */
+ if (ap->pflags & ATA_PFLAG_SUSPENDED)
+ goto unlock;
+
if (!sdev)
continue;
if (scsi_device_get(sdev))
continue;
- /*
- * If the rescan work was scheduled because of a resume
- * event, the port is already fully resumed, but the
- * SCSI device may not yet be fully resumed. In such
- * case, executing scsi_rescan_device() may cause a
- * deadlock with the PM code on device_lock(). Prevent
- * this by giving up and retrying rescan after a short
- * delay.
- */
- delay_rescan = sdev->sdev_gendev.power.is_suspended;
- if (delay_rescan) {
- scsi_device_put(sdev);
- break;
- }
-
spin_unlock_irqrestore(ap->lock, flags);
- scsi_rescan_device(sdev);
+ ret = scsi_rescan_device(sdev);
scsi_device_put(sdev);
spin_lock_irqsave(ap->lock, flags);
+
+ if (ret)
+ goto unlock;
}
}
+unlock:
spin_unlock_irqrestore(ap->lock, flags);
mutex_unlock(&ap->scsi_scan_mutex);
- if (delay_rescan)
+ /* Reschedule with a delay if scsi_rescan_device() returned an error */
+ if (ret)
schedule_delayed_work(&ap->scsi_rescan_task,
msecs_to_jiffies(5));
}
The patch below does not apply to the 6.5-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.5.y
git checkout FETCH_HEAD
git cherry-pick -x ff48b37802e5c134e2dfc4d091f10b2eb5065a72
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100454-hamper-falsify-595d@gregkh' --subject-prefix 'PATCH 6.5.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ff48b37802e5c134e2dfc4d091f10b2eb5065a72 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal(a)kernel.org>
Date: Fri, 15 Sep 2023 15:00:13 +0900
Subject: [PATCH] scsi: Do not attempt to rescan suspended devices
scsi_rescan_device() takes a scsi device lock before executing a device
handler and device driver rescan methods. Waiting for the completion of
any command issued to the device by these methods will thus be done with
the device lock held. As a result, there is a risk of deadlocking within
the power management code if scsi_rescan_device() is called to handle a
device resume with the associated scsi device not yet resumed.
Avoid such situation by checking that the target scsi device is in the
running state, that is, fully capable of executing commands, before
proceeding with the rescan and bailout returning -EWOULDBLOCK otherwise.
With this error return, the caller can retry rescaning the device after
a delay.
The state check is done with the device lock held and is thus safe
against incoming suspend power management operations.
Fixes: 6aa0365a3c85 ("ata: libata-scsi: Avoid deadlock on rescan after device resume")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <dlemoal(a)kernel.org>
Reviewed-by: Hannes Reinecke <hare(a)suse.de>
Reviewed-by: Niklas Cassel <niklas.cassel(a)wdc.com>
Tested-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
Reviewed-by: Martin K. Petersen <martin.petersen(a)oracle.com>
Reviewed-by: Bart Van Assche <bvanassche(a)acm.org>
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 52014b2d39e1..3db4d31a03a1 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -1619,12 +1619,24 @@ int scsi_add_device(struct Scsi_Host *host, uint channel,
}
EXPORT_SYMBOL(scsi_add_device);
-void scsi_rescan_device(struct scsi_device *sdev)
+int scsi_rescan_device(struct scsi_device *sdev)
{
struct device *dev = &sdev->sdev_gendev;
+ int ret = 0;
device_lock(dev);
+ /*
+ * Bail out if the device is not running. Otherwise, the rescan may
+ * block waiting for commands to be executed, with us holding the
+ * device lock. This can result in a potential deadlock in the power
+ * management core code when system resume is on-going.
+ */
+ if (sdev->sdev_state != SDEV_RUNNING) {
+ ret = -EWOULDBLOCK;
+ goto unlock;
+ }
+
scsi_attach_vpd(sdev);
scsi_cdl_check(sdev);
@@ -1638,7 +1650,11 @@ void scsi_rescan_device(struct scsi_device *sdev)
drv->rescan(dev);
module_put(dev->driver->owner);
}
+
+unlock:
device_unlock(dev);
+
+ return ret;
}
EXPORT_SYMBOL(scsi_rescan_device);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 49f768d0ff37..4c2dc8150c6d 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -764,7 +764,7 @@ scsi_template_proc_dir(const struct scsi_host_template *sht);
#define scsi_template_proc_dir(sht) NULL
#endif
extern void scsi_scan_host(struct Scsi_Host *);
-extern void scsi_rescan_device(struct scsi_device *);
+extern int scsi_rescan_device(struct scsi_device *sdev);
extern void scsi_remove_host(struct Scsi_Host *);
extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *);
extern int scsi_host_busy(struct Scsi_Host *shost);
The quilt patch titled
Subject: mips: use nth_page() in place of direct struct page manipulation
has been removed from the -mm tree. Its filename was
mips-use-nth_page-in-place-of-direct-struct-page-manipulation.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zi Yan <ziy(a)nvidia.com>
Subject: mips: use nth_page() in place of direct struct page manipulation
Date: Wed, 13 Sep 2023 16:12:48 -0400
__flush_dcache_pages() is called during hugetlb migration via
migrate_pages() -> migrate_hugetlbs() -> unmap_and_move_huge_page() ->
move_to_new_folio() -> flush_dcache_folio(). And with hugetlb and without
sparsemem vmemmap, struct page is not guaranteed to be contiguous beyond a
section. Use nth_page() instead.
Without the fix, a wrong address might be used for data cache page flush.
No bug is reported. The fix comes from code inspection.
Link: https://lkml.kernel.org/r/20230913201248.452081-6-zi.yan@sent.com
Fixes: 15fa3e8e3269 ("mips: implement the new page table range API")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Mike Rapoport (IBM) <rppt(a)kernel.org>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/mips/mm/cache.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/mips/mm/cache.c~mips-use-nth_page-in-place-of-direct-struct-page-manipulation
+++ a/arch/mips/mm/cache.c
@@ -117,7 +117,7 @@ void __flush_dcache_pages(struct page *p
* get faulted into the tlb (and thus flushed) anyways.
*/
for (i = 0; i < nr; i++) {
- addr = (unsigned long)kmap_local_page(page + i);
+ addr = (unsigned long)kmap_local_page(nth_page(page, i));
flush_data_cache_page(addr);
kunmap_local((void *)addr);
}
_
Patches currently in -mm which might be from ziy(a)nvidia.com are
The quilt patch titled
Subject: mm/cma: use nth_page() in place of direct struct page manipulation
has been removed from the -mm tree. Its filename was
mm-cma-use-nth_page-in-place-of-direct-struct-page-manipulation.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zi Yan <ziy(a)nvidia.com>
Subject: mm/cma: use nth_page() in place of direct struct page manipulation
Date: Wed, 13 Sep 2023 16:12:44 -0400
Patch series "Use nth_page() in place of direct struct page manipulation",
v3.
On SPARSEMEM without VMEMMAP, struct page is not guaranteed to be
contiguous, since each memory section's memmap might be allocated
independently. hugetlb pages can go beyond a memory section size, thus
direct struct page manipulation on hugetlb pages/subpages might give wrong
struct page. Kernel provides nth_page() to do the manipulation properly.
Use that whenever code can see hugetlb pages.
This patch (of 5):
When dealing with hugetlb pages, manipulating struct page pointers
directly can get to wrong struct page, since struct page is not guaranteed
to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle
it properly.
Without the fix, page_kasan_tag_reset() could reset wrong page tags,
causing a wrong kasan result. No related bug is reported. The fix
comes from code inspection.
Link: https://lkml.kernel.org/r/20230913201248.452081-1-zi.yan@sent.com
Link: https://lkml.kernel.org/r/20230913201248.452081-2-zi.yan@sent.com
Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Mike Rapoport (IBM) <rppt(a)kernel.org>
Cc: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/cma.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/cma.c~mm-cma-use-nth_page-in-place-of-direct-struct-page-manipulation
+++ a/mm/cma.c
@@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma,
*/
if (page) {
for (i = 0; i < count; i++)
- page_kasan_tag_reset(page + i);
+ page_kasan_tag_reset(nth_page(page, i));
}
if (ret && !no_warn) {
_
Patches currently in -mm which might be from ziy(a)nvidia.com are
The quilt patch titled
Subject: fs: use nth_page() in place of direct struct page manipulation
has been removed from the -mm tree. Its filename was
fs-use-nth_page-in-place-of-direct-struct-page-manipulation.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zi Yan <ziy(a)nvidia.com>
Subject: fs: use nth_page() in place of direct struct page manipulation
Date: Wed, 13 Sep 2023 16:12:47 -0400
When dealing with hugetlb pages, struct page is not guaranteed to be
contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it
properly.
Without the fix, a wrong subpage might be checked for HWPoison, causing wrong
number of bytes of a page copied to user space. No bug is reported. The fix
comes from code inspection.
Link: https://lkml.kernel.org/r/20230913201248.452081-5-zi.yan@sent.com
Fixes: 38c1ddbde6c6 ("hugetlbfs: improve read HWPOISON hugepage")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Mike Rapoport (IBM) <rppt(a)kernel.org>
Cc: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/hugetlbfs/inode.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/fs/hugetlbfs/inode.c~fs-use-nth_page-in-place-of-direct-struct-page-manipulation
+++ a/fs/hugetlbfs/inode.c
@@ -295,7 +295,7 @@ static size_t adjust_range_hwpoison(stru
size_t res = 0;
/* First subpage to start the loop. */
- page += offset / PAGE_SIZE;
+ page = nth_page(page, offset / PAGE_SIZE);
offset %= PAGE_SIZE;
while (1) {
if (is_raw_hwpoison_page_in_hugepage(page))
@@ -309,7 +309,7 @@ static size_t adjust_range_hwpoison(stru
break;
offset += n;
if (offset == PAGE_SIZE) {
- page++;
+ page = nth_page(page, 1);
offset = 0;
}
}
_
Patches currently in -mm which might be from ziy(a)nvidia.com are
The quilt patch titled
Subject: mm/memory_hotplug: use pfn math in place of direct struct page manipulation
has been removed from the -mm tree. Its filename was
mm-memory_hotplug-use-pfn-math-in-place-of-direct-struct-page-manipulation.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zi Yan <ziy(a)nvidia.com>
Subject: mm/memory_hotplug: use pfn math in place of direct struct page manipulation
Date: Wed, 13 Sep 2023 16:12:46 -0400
When dealing with hugetlb pages, manipulating struct page pointers
directly can get to wrong struct page, since struct page is not guaranteed
to be contiguous on SPARSEMEM without VMEMMAP. Use pfn calculation to
handle it properly.
Without the fix, a wrong number of page might be skipped. Since skip cannot be
negative, scan_movable_page() will end early and might miss a movable page with
-ENOENT. This might fail offline_pages(). No bug is reported. The fix comes
from code inspection.
Link: https://lkml.kernel.org/r/20230913201248.452081-4-zi.yan@sent.com
Fixes: eeb0efd071d8 ("mm,memory_hotplug: fix scan_movable_pages() for gigantic hugepages")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Mike Rapoport (IBM) <rppt(a)kernel.org>
Cc: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory_hotplug.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/memory_hotplug.c~mm-memory_hotplug-use-pfn-math-in-place-of-direct-struct-page-manipulation
+++ a/mm/memory_hotplug.c
@@ -1689,7 +1689,7 @@ static int scan_movable_pages(unsigned l
*/
if (HPageMigratable(head))
goto found;
- skip = compound_nr(head) - (page - head);
+ skip = compound_nr(head) - (pfn - page_to_pfn(head));
pfn += skip - 1;
}
return -ENOENT;
_
Patches currently in -mm which might be from ziy(a)nvidia.com are
The quilt patch titled
Subject: mm/hugetlb: use nth_page() in place of direct struct page manipulation
has been removed from the -mm tree. Its filename was
mm-hugetlb-use-nth_page-in-place-of-direct-struct-page-manipulation.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zi Yan <ziy(a)nvidia.com>
Subject: mm/hugetlb: use nth_page() in place of direct struct page manipulation
Date: Wed, 13 Sep 2023 16:12:45 -0400
When dealing with hugetlb pages, manipulating struct page pointers
directly can get to wrong struct page, since struct page is not guaranteed
to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle
it properly.
A wrong or non-existing page might be tried to be grabbed, either
leading to a non freeable page or kernel memory access errors. No bug
is reported. It comes from code inspection.
Link: https://lkml.kernel.org/r/20230913201248.452081-3-zi.yan@sent.com
Fixes: 57a196a58421 ("hugetlb: simplify hugetlb handling in follow_page_mask")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Muchun Song <songmuchun(a)bytedance.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Mike Kravetz <mike.kravetz(a)oracle.com>
Cc: Mike Rapoport (IBM) <rppt(a)kernel.org>
Cc: Thomas Bogendoerfer <tsbogend(a)alpha.franken.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/hugetlb.c~mm-hugetlb-use-nth_page-in-place-of-direct-struct-page-manipulation
+++ a/mm/hugetlb.c
@@ -6493,7 +6493,7 @@ struct page *hugetlb_follow_page_mask(st
}
}
- page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
/*
* Note that page may be a sub-page, and with vmemmap
_
Patches currently in -mm which might be from ziy(a)nvidia.com are
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 7b086755fb8cdbb6b3e45a1bbddc00e7f9b1dc03
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100429-postnasal-hassle-02be@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7b086755fb8cdbb6b3e45a1bbddc00e7f9b1dc03 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes(a)cmpxchg.org>
Date: Mon, 11 Sep 2023 14:11:08 -0400
Subject: [PATCH] mm: page_alloc: fix CMA and HIGHATOMIC landing on the wrong
buddy list
Commit 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock")
bypasses the pcplist on lock contention and returns the page directly to
the buddy list of the page's migratetype.
For pages that don't have their own pcplist, such as CMA and HIGHATOMIC,
the migratetype is temporarily updated such that the page can hitch a ride
on the MOVABLE pcplist. Their true type is later reassessed when flushing
in free_pcppages_bulk(). However, when lock contention is detected after
the type was already overridden, the bypass will then put the page on the
wrong buddy list.
Once on the MOVABLE buddy list, the page becomes eligible for fallbacks
and even stealing. In the case of HIGHATOMIC, otherwise ineligible
allocations can dip into the highatomic reserves. In the case of CMA, the
page can be lost from the CMA region permanently.
Use a separate pcpmigratetype variable for the pcplist override. Use the
original migratetype when going directly to the buddy. This fixes the bug
and should make the intentions more obvious in the code.
Originally sent here to address the HIGHATOMIC case:
https://lore.kernel.org/lkml/20230821183733.106619-4-hannes@cmpxchg.org/
Changelog updated in response to the CMA-specific bug report.
[mgorman(a)techsingularity.net: updated changelog]
Link: https://lkml.kernel.org/r/20230911181108.GA104295@cmpxchg.org
Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock")
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Reported-by: Joe Liu <joe.liu(a)mediatek.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0c5be12f9336..95546f376302 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2400,7 +2400,7 @@ void free_unref_page(struct page *page, unsigned int order)
struct per_cpu_pages *pcp;
struct zone *zone;
unsigned long pfn = page_to_pfn(page);
- int migratetype;
+ int migratetype, pcpmigratetype;
if (!free_unref_page_prepare(page, pfn, order))
return;
@@ -2408,24 +2408,24 @@ void free_unref_page(struct page *page, unsigned int order)
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Place ISOLATE pages on the isolated list because they are being
- * offlined but treat HIGHATOMIC as movable pages so we can get those
- * areas back if necessary. Otherwise, we may have to free
+ * offlined but treat HIGHATOMIC and CMA as movable pages so we can
+ * get those areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
- migratetype = get_pcppage_migratetype(page);
+ migratetype = pcpmigratetype = get_pcppage_migratetype(page);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
return;
}
- migratetype = MIGRATE_MOVABLE;
+ pcpmigratetype = MIGRATE_MOVABLE;
}
zone = page_zone(page);
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
- free_unref_page_commit(zone, pcp, page, migratetype, order);
+ free_unref_page_commit(zone, pcp, page, pcpmigratetype, order);
pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
Recent changes to kernel_connect() and kernel_bind() ensure that
callers are insulated from changes to the address parameter made by BPF
SOCK_ADDR hooks. This patch wraps direct calls to ops->connect() and
ops->bind() with kernel_connect() and kernel_bind() to ensure that SMB
mounts do not see their mount address overwritten in such cases.
Link: https://lore.kernel.org/netdev/9944248dba1bce861375fcce9de663934d933ba9.cam…
Cc: <stable(a)vger.kernel.org> # 6.x.y
Signed-off-by: Jordan Rife <jrife(a)google.com>
---
fs/smb/client/connect.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 3902e90dca6b0..ce11165446cfb 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -2895,9 +2895,9 @@ bind_socket(struct TCP_Server_Info *server)
if (server->srcaddr.ss_family != AF_UNSPEC) {
/* Bind to the specified local IP address */
struct socket *socket = server->ssocket;
- rc = socket->ops->bind(socket,
- (struct sockaddr *) &server->srcaddr,
- sizeof(server->srcaddr));
+ rc = kernel_bind(socket,
+ (struct sockaddr *) &server->srcaddr,
+ sizeof(server->srcaddr));
if (rc < 0) {
struct sockaddr_in *saddr4;
struct sockaddr_in6 *saddr6;
@@ -3046,8 +3046,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
socket->sk->sk_sndbuf,
socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
- rc = socket->ops->connect(socket, saddr, slen,
- server->noblockcnt ? O_NONBLOCK : 0);
+ rc = kernel_connect(socket, saddr, slen,
+ server->noblockcnt ? O_NONBLOCK : 0);
/*
* When mounting SMB root file systems, we do not want to block in
* connect. Otherwise bail out and then let cifs_reconnect() perform
--
2.42.0.582.g8ccd20d70d-goog
Hi Greg,
> On 4 Oct 2023, at 16:58, gregkh(a)linuxfoundation.org wrote:
>
> 
> This is a note to let you know that I've just added the patch titled
>
> ASoC: amd: yc: Fix non-functional mic on Lenovo 82QF and 82UG
>
> to the 6.1-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> asoc-amd-yc-fix-non-functional-mic-on-lenovo-82qf-and-82ug.patch
> and it can be found in the queue-6.1 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
> From 1263cc0f414d212129c0f1289b49b7df77f92084 Mon Sep 17 00:00:00 2001
> From: August Wikerfors <git(a)augustwikerfors.se>
> Date: Mon, 11 Sep 2023 23:34:09 +0200
> Subject: ASoC: amd: yc: Fix non-functional mic on Lenovo 82QF and 82UG
>
> From: August Wikerfors <git(a)augustwikerfors.se>
>
> commit 1263cc0f414d212129c0f1289b49b7df77f92084 upstream.
>
> Like the Lenovo 82TL and 82V2, the Lenovo 82QF (Yoga 7 14ARB7) and 82UG
> (Legion S7 16ARHA7) both need a quirk entry for the internal microphone to
> function. Commit c008323fe361 ("ASoC: amd: yc: Fix a non-functional mic on
> Lenovo 82SJ") restricted the quirk that previously matched "82" to "82V2",
> breaking microphone functionality on these devices. Fix this by adding
> specific quirks for these models, as was done for the Lenovo 82TL.
>
> Fixes: c008323fe361 ("ASoC: amd: yc: Fix a non-functional mic on Lenovo 82SJ")
> Closes: https://github.com/tomsom/yoga-linux/issues/51
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=208555#c780
> Cc: stable(a)vger.kernel.org
> Signed-off-by: August Wikerfors <git(a)augustwikerfors.se>
> Link: https://lore.kernel.org/r/20230911213409.6106-1-git@augustwikerfors.se
> Signed-off-by: Mark Brown <broonie(a)kernel.org>
> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Please also add commit cfff2a7794d2 ("ASoC: amd: yc: Fix a non-functional mic on Lenovo 82TL") to 6.1+ as it fixed the same regression for another model (but wasn’t tagged for stable).
Regards,
August Wikerfors
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100407-ruby-cancel-cdd0@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 19:01:03 +0200
Subject: [PATCH] rbd: take header_rwsem in rbd_dev_refresh() only when
updating
rbd_dev_refresh() has been holding header_rwsem across header and
parent info read-in unnecessarily for ages. With commit 870611e4877e
("rbd: get snapshot context after exclusive lock is ensured to be
held"), the potential for deadlocks became much more real owning to
a) header_rwsem now nesting inside lock_rwsem and b) rw_semaphores
not allowing new readers after a writer is registered.
For example, assuming that I/O request 1, I/O request 2 and header
read-in request all target the same OSD:
1. I/O request 1 comes in and gets submitted
2. watch error occurs
3. rbd_watch_errcb() takes lock_rwsem for write, clears owner_cid and
releases lock_rwsem
4. after reestablishing the watch, rbd_reregister_watch() calls
rbd_dev_refresh() which takes header_rwsem for write and submits
a header read-in request
5. I/O request 2 comes in: after taking lock_rwsem for read in
__rbd_img_handle_request(), it blocks trying to take header_rwsem
for read in rbd_img_object_requests()
6. another watch error occurs
7. rbd_watch_errcb() blocks trying to take lock_rwsem for write
8. I/O request 1 completion is received by the messenger but can't be
processed because lock_rwsem won't be granted anymore
9. header read-in request completion can't be received, let alone
processed, because the messenger is stranded
Change rbd_dev_refresh() to take header_rwsem only for actually
updating rbd_dev->header. Header and parent info read-in don't need
any locking.
Cc: stable(a)vger.kernel.org # 0b035401c570: rbd: move rbd_dev_refresh() definition
Cc: stable(a)vger.kernel.org # 510a7330c82a: rbd: decouple header read-in from updating rbd_dev->header
Cc: stable(a)vger.kernel.org # c10311776f0a: rbd: decouple parent info read-in from updating rbd_dev
Cc: stable(a)vger.kernel.org
Fixes: 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held")
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d62a0298c890..a999b698b131 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -6986,7 +6986,14 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
- rbd_dev->header.image_size = header->image_size;
+ if (rbd_dev->header.image_size != header->image_size) {
+ rbd_dev->header.image_size = header->image_size;
+
+ if (!rbd_is_snap(rbd_dev)) {
+ rbd_dev->mapping.size = header->image_size;
+ rbd_dev_update_size(rbd_dev);
+ }
+ }
ceph_put_snap_context(rbd_dev->header.snapc);
rbd_dev->header.snapc = header->snapc;
@@ -7044,11 +7051,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
struct parent_image_info pii = { 0 };
- u64 mapping_size;
int ret;
- down_write(&rbd_dev->header_rwsem);
- mapping_size = rbd_dev->mapping.size;
+ dout("%s rbd_dev %p\n", __func__, rbd_dev);
ret = rbd_dev_header_info(rbd_dev, &header, false);
if (ret)
@@ -7064,18 +7069,13 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
goto out;
}
+ down_write(&rbd_dev->header_rwsem);
rbd_dev_update_header(rbd_dev, &header);
if (rbd_dev->parent)
rbd_dev_update_parent(rbd_dev, &pii);
-
- rbd_assert(!rbd_is_snap(rbd_dev));
- rbd_dev->mapping.size = rbd_dev->header.image_size;
+ up_write(&rbd_dev->header_rwsem);
out:
- up_write(&rbd_dev->header_rwsem);
- if (!ret && mapping_size != rbd_dev->mapping.size)
- rbd_dev_update_size(rbd_dev);
-
rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100406-routine-tackle-b16a@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 19:01:03 +0200
Subject: [PATCH] rbd: take header_rwsem in rbd_dev_refresh() only when
updating
rbd_dev_refresh() has been holding header_rwsem across header and
parent info read-in unnecessarily for ages. With commit 870611e4877e
("rbd: get snapshot context after exclusive lock is ensured to be
held"), the potential for deadlocks became much more real owning to
a) header_rwsem now nesting inside lock_rwsem and b) rw_semaphores
not allowing new readers after a writer is registered.
For example, assuming that I/O request 1, I/O request 2 and header
read-in request all target the same OSD:
1. I/O request 1 comes in and gets submitted
2. watch error occurs
3. rbd_watch_errcb() takes lock_rwsem for write, clears owner_cid and
releases lock_rwsem
4. after reestablishing the watch, rbd_reregister_watch() calls
rbd_dev_refresh() which takes header_rwsem for write and submits
a header read-in request
5. I/O request 2 comes in: after taking lock_rwsem for read in
__rbd_img_handle_request(), it blocks trying to take header_rwsem
for read in rbd_img_object_requests()
6. another watch error occurs
7. rbd_watch_errcb() blocks trying to take lock_rwsem for write
8. I/O request 1 completion is received by the messenger but can't be
processed because lock_rwsem won't be granted anymore
9. header read-in request completion can't be received, let alone
processed, because the messenger is stranded
Change rbd_dev_refresh() to take header_rwsem only for actually
updating rbd_dev->header. Header and parent info read-in don't need
any locking.
Cc: stable(a)vger.kernel.org # 0b035401c570: rbd: move rbd_dev_refresh() definition
Cc: stable(a)vger.kernel.org # 510a7330c82a: rbd: decouple header read-in from updating rbd_dev->header
Cc: stable(a)vger.kernel.org # c10311776f0a: rbd: decouple parent info read-in from updating rbd_dev
Cc: stable(a)vger.kernel.org
Fixes: 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held")
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d62a0298c890..a999b698b131 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -6986,7 +6986,14 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
- rbd_dev->header.image_size = header->image_size;
+ if (rbd_dev->header.image_size != header->image_size) {
+ rbd_dev->header.image_size = header->image_size;
+
+ if (!rbd_is_snap(rbd_dev)) {
+ rbd_dev->mapping.size = header->image_size;
+ rbd_dev_update_size(rbd_dev);
+ }
+ }
ceph_put_snap_context(rbd_dev->header.snapc);
rbd_dev->header.snapc = header->snapc;
@@ -7044,11 +7051,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
struct parent_image_info pii = { 0 };
- u64 mapping_size;
int ret;
- down_write(&rbd_dev->header_rwsem);
- mapping_size = rbd_dev->mapping.size;
+ dout("%s rbd_dev %p\n", __func__, rbd_dev);
ret = rbd_dev_header_info(rbd_dev, &header, false);
if (ret)
@@ -7064,18 +7069,13 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
goto out;
}
+ down_write(&rbd_dev->header_rwsem);
rbd_dev_update_header(rbd_dev, &header);
if (rbd_dev->parent)
rbd_dev_update_parent(rbd_dev, &pii);
-
- rbd_assert(!rbd_is_snap(rbd_dev));
- rbd_dev->mapping.size = rbd_dev->header.image_size;
+ up_write(&rbd_dev->header_rwsem);
out:
- up_write(&rbd_dev->header_rwsem);
- if (!ret && mapping_size != rbd_dev->mapping.size)
- rbd_dev_update_size(rbd_dev);
-
rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100404-duffel-hungrily-2206@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 19:01:03 +0200
Subject: [PATCH] rbd: take header_rwsem in rbd_dev_refresh() only when
updating
rbd_dev_refresh() has been holding header_rwsem across header and
parent info read-in unnecessarily for ages. With commit 870611e4877e
("rbd: get snapshot context after exclusive lock is ensured to be
held"), the potential for deadlocks became much more real owning to
a) header_rwsem now nesting inside lock_rwsem and b) rw_semaphores
not allowing new readers after a writer is registered.
For example, assuming that I/O request 1, I/O request 2 and header
read-in request all target the same OSD:
1. I/O request 1 comes in and gets submitted
2. watch error occurs
3. rbd_watch_errcb() takes lock_rwsem for write, clears owner_cid and
releases lock_rwsem
4. after reestablishing the watch, rbd_reregister_watch() calls
rbd_dev_refresh() which takes header_rwsem for write and submits
a header read-in request
5. I/O request 2 comes in: after taking lock_rwsem for read in
__rbd_img_handle_request(), it blocks trying to take header_rwsem
for read in rbd_img_object_requests()
6. another watch error occurs
7. rbd_watch_errcb() blocks trying to take lock_rwsem for write
8. I/O request 1 completion is received by the messenger but can't be
processed because lock_rwsem won't be granted anymore
9. header read-in request completion can't be received, let alone
processed, because the messenger is stranded
Change rbd_dev_refresh() to take header_rwsem only for actually
updating rbd_dev->header. Header and parent info read-in don't need
any locking.
Cc: stable(a)vger.kernel.org # 0b035401c570: rbd: move rbd_dev_refresh() definition
Cc: stable(a)vger.kernel.org # 510a7330c82a: rbd: decouple header read-in from updating rbd_dev->header
Cc: stable(a)vger.kernel.org # c10311776f0a: rbd: decouple parent info read-in from updating rbd_dev
Cc: stable(a)vger.kernel.org
Fixes: 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held")
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d62a0298c890..a999b698b131 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -6986,7 +6986,14 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
- rbd_dev->header.image_size = header->image_size;
+ if (rbd_dev->header.image_size != header->image_size) {
+ rbd_dev->header.image_size = header->image_size;
+
+ if (!rbd_is_snap(rbd_dev)) {
+ rbd_dev->mapping.size = header->image_size;
+ rbd_dev_update_size(rbd_dev);
+ }
+ }
ceph_put_snap_context(rbd_dev->header.snapc);
rbd_dev->header.snapc = header->snapc;
@@ -7044,11 +7051,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
struct parent_image_info pii = { 0 };
- u64 mapping_size;
int ret;
- down_write(&rbd_dev->header_rwsem);
- mapping_size = rbd_dev->mapping.size;
+ dout("%s rbd_dev %p\n", __func__, rbd_dev);
ret = rbd_dev_header_info(rbd_dev, &header, false);
if (ret)
@@ -7064,18 +7069,13 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
goto out;
}
+ down_write(&rbd_dev->header_rwsem);
rbd_dev_update_header(rbd_dev, &header);
if (rbd_dev->parent)
rbd_dev_update_parent(rbd_dev, &pii);
-
- rbd_assert(!rbd_is_snap(rbd_dev));
- rbd_dev->mapping.size = rbd_dev->header.image_size;
+ up_write(&rbd_dev->header_rwsem);
out:
- up_write(&rbd_dev->header_rwsem);
- if (!ret && mapping_size != rbd_dev->mapping.size)
- rbd_dev_update_size(rbd_dev);
-
rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100402-stomp-wrist-122b@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0b207d02bd9ab8dcc31b262ca9f60dbc1822500d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 19:01:03 +0200
Subject: [PATCH] rbd: take header_rwsem in rbd_dev_refresh() only when
updating
rbd_dev_refresh() has been holding header_rwsem across header and
parent info read-in unnecessarily for ages. With commit 870611e4877e
("rbd: get snapshot context after exclusive lock is ensured to be
held"), the potential for deadlocks became much more real owning to
a) header_rwsem now nesting inside lock_rwsem and b) rw_semaphores
not allowing new readers after a writer is registered.
For example, assuming that I/O request 1, I/O request 2 and header
read-in request all target the same OSD:
1. I/O request 1 comes in and gets submitted
2. watch error occurs
3. rbd_watch_errcb() takes lock_rwsem for write, clears owner_cid and
releases lock_rwsem
4. after reestablishing the watch, rbd_reregister_watch() calls
rbd_dev_refresh() which takes header_rwsem for write and submits
a header read-in request
5. I/O request 2 comes in: after taking lock_rwsem for read in
__rbd_img_handle_request(), it blocks trying to take header_rwsem
for read in rbd_img_object_requests()
6. another watch error occurs
7. rbd_watch_errcb() blocks trying to take lock_rwsem for write
8. I/O request 1 completion is received by the messenger but can't be
processed because lock_rwsem won't be granted anymore
9. header read-in request completion can't be received, let alone
processed, because the messenger is stranded
Change rbd_dev_refresh() to take header_rwsem only for actually
updating rbd_dev->header. Header and parent info read-in don't need
any locking.
Cc: stable(a)vger.kernel.org # 0b035401c570: rbd: move rbd_dev_refresh() definition
Cc: stable(a)vger.kernel.org # 510a7330c82a: rbd: decouple header read-in from updating rbd_dev->header
Cc: stable(a)vger.kernel.org # c10311776f0a: rbd: decouple parent info read-in from updating rbd_dev
Cc: stable(a)vger.kernel.org
Fixes: 870611e4877e ("rbd: get snapshot context after exclusive lock is ensured to be held")
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d62a0298c890..a999b698b131 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -6986,7 +6986,14 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
- rbd_dev->header.image_size = header->image_size;
+ if (rbd_dev->header.image_size != header->image_size) {
+ rbd_dev->header.image_size = header->image_size;
+
+ if (!rbd_is_snap(rbd_dev)) {
+ rbd_dev->mapping.size = header->image_size;
+ rbd_dev_update_size(rbd_dev);
+ }
+ }
ceph_put_snap_context(rbd_dev->header.snapc);
rbd_dev->header.snapc = header->snapc;
@@ -7044,11 +7051,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
struct parent_image_info pii = { 0 };
- u64 mapping_size;
int ret;
- down_write(&rbd_dev->header_rwsem);
- mapping_size = rbd_dev->mapping.size;
+ dout("%s rbd_dev %p\n", __func__, rbd_dev);
ret = rbd_dev_header_info(rbd_dev, &header, false);
if (ret)
@@ -7064,18 +7069,13 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
goto out;
}
+ down_write(&rbd_dev->header_rwsem);
rbd_dev_update_header(rbd_dev, &header);
if (rbd_dev->parent)
rbd_dev_update_parent(rbd_dev, &pii);
-
- rbd_assert(!rbd_is_snap(rbd_dev));
- rbd_dev->mapping.size = rbd_dev->header.image_size;
+ up_write(&rbd_dev->header_rwsem);
out:
- up_write(&rbd_dev->header_rwsem);
- if (!ret && mapping_size != rbd_dev->mapping.size)
- rbd_dev_update_size(rbd_dev);
-
rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x c10311776f0a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100450-headrest-ecologist-205f@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c10311776f0a8ddea2276df96e255625b07045a8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 18:38:26 +0200
Subject: [PATCH] rbd: decouple parent info read-in from updating rbd_dev
Unlike header read-in, parent info read-in is already decoupled in
get_parent_info(), but it's buried in rbd_dev_v2_parent_info() along
with the processing logic.
Separate the initial read-in and update read-in logic into
rbd_dev_setup_parent() and rbd_dev_update_parent() respectively and
have rbd_dev_v2_parent_info() just populate struct parent_image_info
(i.e. what get_parent_info() did). Some existing QoI issues, like
flatten of a standalone clone being disregarded on refresh, remain.
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6ed5520ef303..d62a0298c890 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5594,6 +5594,14 @@ struct parent_image_info {
u64 overlap;
};
+static void rbd_parent_info_cleanup(struct parent_image_info *pii)
+{
+ kfree(pii->pool_ns);
+ kfree(pii->image_id);
+
+ memset(pii, 0, sizeof(*pii));
+}
+
/*
* The caller is responsible for @pii.
*/
@@ -5663,6 +5671,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
if (pii->has_overlap)
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
@@ -5701,14 +5712,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
pii->has_overlap = true;
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
return -EINVAL;
}
-static int get_parent_info(struct rbd_device *rbd_dev,
- struct parent_image_info *pii)
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
{
struct page *req_page, *reply_page;
void *p;
@@ -5736,7 +5750,7 @@ static int get_parent_info(struct rbd_device *rbd_dev,
return ret;
}
-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
{
struct rbd_spec *parent_spec;
struct parent_image_info pii = { 0 };
@@ -5746,37 +5760,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
if (!parent_spec)
return -ENOMEM;
- ret = get_parent_info(rbd_dev, &pii);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out_err;
- dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
- __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
- pii.has_overlap, pii.overlap);
-
- if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
- /*
- * Either the parent never existed, or we have
- * record of it but the image got flattened so it no
- * longer has a parent. When the parent of a
- * layered image disappears we immediately set the
- * overlap to 0. The effect of this is that all new
- * requests will be treated as if the image had no
- * parent.
- *
- * If !pii.has_overlap, the parent image spec is not
- * applicable. It's there to avoid duplication in each
- * snapshot record.
- */
- if (rbd_dev->parent_overlap) {
- rbd_dev->parent_overlap = 0;
- rbd_dev_parent_put(rbd_dev);
- pr_info("%s: clone image has been flattened\n",
- rbd_dev->disk->disk_name);
- }
-
+ if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
goto out; /* No parent? No problem. */
- }
/* The ceph file layout needs to fit pool id in 32 bits */
@@ -5788,46 +5777,34 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
/*
- * The parent won't change (except when the clone is
- * flattened, already handled that). So we only need to
- * record the parent spec we have not already done so.
+ * The parent won't change except when the clone is flattened,
+ * so we only need to record the parent image spec once.
*/
- if (!rbd_dev->parent_spec) {
- parent_spec->pool_id = pii.pool_id;
- if (pii.pool_ns && *pii.pool_ns) {
- parent_spec->pool_ns = pii.pool_ns;
- pii.pool_ns = NULL;
- }
- parent_spec->image_id = pii.image_id;
- pii.image_id = NULL;
- parent_spec->snap_id = pii.snap_id;
-
- rbd_dev->parent_spec = parent_spec;
- parent_spec = NULL; /* rbd_dev now owns this */
+ parent_spec->pool_id = pii.pool_id;
+ if (pii.pool_ns && *pii.pool_ns) {
+ parent_spec->pool_ns = pii.pool_ns;
+ pii.pool_ns = NULL;
}
+ parent_spec->image_id = pii.image_id;
+ pii.image_id = NULL;
+ parent_spec->snap_id = pii.snap_id;
+
+ rbd_assert(!rbd_dev->parent_spec);
+ rbd_dev->parent_spec = parent_spec;
+ parent_spec = NULL; /* rbd_dev now owns this */
/*
- * We always update the parent overlap. If it's zero we issue
- * a warning, as we will proceed as if there was no parent.
+ * Record the parent overlap. If it's zero, issue a warning as
+ * we will proceed as if there is no parent.
*/
- if (!pii.overlap) {
- if (parent_spec) {
- /* refresh, careful to warn just once */
- if (rbd_dev->parent_overlap)
- rbd_warn(rbd_dev,
- "clone now standalone (overlap became 0)");
- } else {
- /* initial probe */
- rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
- }
- }
+ if (!pii.overlap)
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
rbd_dev->parent_overlap = pii.overlap;
out:
ret = 0;
out_err:
- kfree(pii.pool_ns);
- kfree(pii.image_id);
+ rbd_parent_info_cleanup(&pii);
rbd_spec_put(parent_spec);
return ret;
}
@@ -6977,7 +6954,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
}
if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_setup_parent(rbd_dev);
if (ret)
goto err_out_probe;
}
@@ -7026,9 +7003,47 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
}
}
+static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
+{
+ if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
+ /*
+ * Either the parent never existed, or we have
+ * record of it but the image got flattened so it no
+ * longer has a parent. When the parent of a
+ * layered image disappears we immediately set the
+ * overlap to 0. The effect of this is that all new
+ * requests will be treated as if the image had no
+ * parent.
+ *
+ * If !pii.has_overlap, the parent image spec is not
+ * applicable. It's there to avoid duplication in each
+ * snapshot record.
+ */
+ if (rbd_dev->parent_overlap) {
+ rbd_dev->parent_overlap = 0;
+ rbd_dev_parent_put(rbd_dev);
+ pr_info("%s: clone has been flattened\n",
+ rbd_dev->disk->disk_name);
+ }
+ } else {
+ rbd_assert(rbd_dev->parent_spec);
+
+ /*
+ * Update the parent overlap. If it became zero, issue
+ * a warning as we will proceed as if there is no parent.
+ */
+ if (!pii->overlap && rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone has become standalone (overlap 0)");
+ rbd_dev->parent_overlap = pii->overlap;
+ }
+}
+
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
+ struct parent_image_info pii = { 0 };
u64 mapping_size;
int ret;
@@ -7044,12 +7059,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
* mapped image getting flattened.
*/
if (rbd_dev->parent) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out;
}
rbd_dev_update_header(rbd_dev, &header);
+ if (rbd_dev->parent)
+ rbd_dev_update_parent(rbd_dev, &pii);
rbd_assert(!rbd_is_snap(rbd_dev));
rbd_dev->mapping.size = rbd_dev->header.image_size;
@@ -7059,6 +7076,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
if (!ret && mapping_size != rbd_dev->mapping.size)
rbd_dev_update_size(rbd_dev);
+ rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x c10311776f0a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100449-sulk-spray-e75b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c10311776f0a8ddea2276df96e255625b07045a8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 18:38:26 +0200
Subject: [PATCH] rbd: decouple parent info read-in from updating rbd_dev
Unlike header read-in, parent info read-in is already decoupled in
get_parent_info(), but it's buried in rbd_dev_v2_parent_info() along
with the processing logic.
Separate the initial read-in and update read-in logic into
rbd_dev_setup_parent() and rbd_dev_update_parent() respectively and
have rbd_dev_v2_parent_info() just populate struct parent_image_info
(i.e. what get_parent_info() did). Some existing QoI issues, like
flatten of a standalone clone being disregarded on refresh, remain.
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6ed5520ef303..d62a0298c890 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5594,6 +5594,14 @@ struct parent_image_info {
u64 overlap;
};
+static void rbd_parent_info_cleanup(struct parent_image_info *pii)
+{
+ kfree(pii->pool_ns);
+ kfree(pii->image_id);
+
+ memset(pii, 0, sizeof(*pii));
+}
+
/*
* The caller is responsible for @pii.
*/
@@ -5663,6 +5671,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
if (pii->has_overlap)
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
@@ -5701,14 +5712,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
pii->has_overlap = true;
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
return -EINVAL;
}
-static int get_parent_info(struct rbd_device *rbd_dev,
- struct parent_image_info *pii)
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
{
struct page *req_page, *reply_page;
void *p;
@@ -5736,7 +5750,7 @@ static int get_parent_info(struct rbd_device *rbd_dev,
return ret;
}
-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
{
struct rbd_spec *parent_spec;
struct parent_image_info pii = { 0 };
@@ -5746,37 +5760,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
if (!parent_spec)
return -ENOMEM;
- ret = get_parent_info(rbd_dev, &pii);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out_err;
- dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
- __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
- pii.has_overlap, pii.overlap);
-
- if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
- /*
- * Either the parent never existed, or we have
- * record of it but the image got flattened so it no
- * longer has a parent. When the parent of a
- * layered image disappears we immediately set the
- * overlap to 0. The effect of this is that all new
- * requests will be treated as if the image had no
- * parent.
- *
- * If !pii.has_overlap, the parent image spec is not
- * applicable. It's there to avoid duplication in each
- * snapshot record.
- */
- if (rbd_dev->parent_overlap) {
- rbd_dev->parent_overlap = 0;
- rbd_dev_parent_put(rbd_dev);
- pr_info("%s: clone image has been flattened\n",
- rbd_dev->disk->disk_name);
- }
-
+ if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
goto out; /* No parent? No problem. */
- }
/* The ceph file layout needs to fit pool id in 32 bits */
@@ -5788,46 +5777,34 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
/*
- * The parent won't change (except when the clone is
- * flattened, already handled that). So we only need to
- * record the parent spec we have not already done so.
+ * The parent won't change except when the clone is flattened,
+ * so we only need to record the parent image spec once.
*/
- if (!rbd_dev->parent_spec) {
- parent_spec->pool_id = pii.pool_id;
- if (pii.pool_ns && *pii.pool_ns) {
- parent_spec->pool_ns = pii.pool_ns;
- pii.pool_ns = NULL;
- }
- parent_spec->image_id = pii.image_id;
- pii.image_id = NULL;
- parent_spec->snap_id = pii.snap_id;
-
- rbd_dev->parent_spec = parent_spec;
- parent_spec = NULL; /* rbd_dev now owns this */
+ parent_spec->pool_id = pii.pool_id;
+ if (pii.pool_ns && *pii.pool_ns) {
+ parent_spec->pool_ns = pii.pool_ns;
+ pii.pool_ns = NULL;
}
+ parent_spec->image_id = pii.image_id;
+ pii.image_id = NULL;
+ parent_spec->snap_id = pii.snap_id;
+
+ rbd_assert(!rbd_dev->parent_spec);
+ rbd_dev->parent_spec = parent_spec;
+ parent_spec = NULL; /* rbd_dev now owns this */
/*
- * We always update the parent overlap. If it's zero we issue
- * a warning, as we will proceed as if there was no parent.
+ * Record the parent overlap. If it's zero, issue a warning as
+ * we will proceed as if there is no parent.
*/
- if (!pii.overlap) {
- if (parent_spec) {
- /* refresh, careful to warn just once */
- if (rbd_dev->parent_overlap)
- rbd_warn(rbd_dev,
- "clone now standalone (overlap became 0)");
- } else {
- /* initial probe */
- rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
- }
- }
+ if (!pii.overlap)
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
rbd_dev->parent_overlap = pii.overlap;
out:
ret = 0;
out_err:
- kfree(pii.pool_ns);
- kfree(pii.image_id);
+ rbd_parent_info_cleanup(&pii);
rbd_spec_put(parent_spec);
return ret;
}
@@ -6977,7 +6954,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
}
if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_setup_parent(rbd_dev);
if (ret)
goto err_out_probe;
}
@@ -7026,9 +7003,47 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
}
}
+static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
+{
+ if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
+ /*
+ * Either the parent never existed, or we have
+ * record of it but the image got flattened so it no
+ * longer has a parent. When the parent of a
+ * layered image disappears we immediately set the
+ * overlap to 0. The effect of this is that all new
+ * requests will be treated as if the image had no
+ * parent.
+ *
+ * If !pii.has_overlap, the parent image spec is not
+ * applicable. It's there to avoid duplication in each
+ * snapshot record.
+ */
+ if (rbd_dev->parent_overlap) {
+ rbd_dev->parent_overlap = 0;
+ rbd_dev_parent_put(rbd_dev);
+ pr_info("%s: clone has been flattened\n",
+ rbd_dev->disk->disk_name);
+ }
+ } else {
+ rbd_assert(rbd_dev->parent_spec);
+
+ /*
+ * Update the parent overlap. If it became zero, issue
+ * a warning as we will proceed as if there is no parent.
+ */
+ if (!pii->overlap && rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone has become standalone (overlap 0)");
+ rbd_dev->parent_overlap = pii->overlap;
+ }
+}
+
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
+ struct parent_image_info pii = { 0 };
u64 mapping_size;
int ret;
@@ -7044,12 +7059,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
* mapped image getting flattened.
*/
if (rbd_dev->parent) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out;
}
rbd_dev_update_header(rbd_dev, &header);
+ if (rbd_dev->parent)
+ rbd_dev_update_parent(rbd_dev, &pii);
rbd_assert(!rbd_is_snap(rbd_dev));
rbd_dev->mapping.size = rbd_dev->header.image_size;
@@ -7059,6 +7076,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
if (!ret && mapping_size != rbd_dev->mapping.size)
rbd_dev_update_size(rbd_dev);
+ rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x c10311776f0a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100448-email-overkill-3d70@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c10311776f0a8ddea2276df96e255625b07045a8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 18:38:26 +0200
Subject: [PATCH] rbd: decouple parent info read-in from updating rbd_dev
Unlike header read-in, parent info read-in is already decoupled in
get_parent_info(), but it's buried in rbd_dev_v2_parent_info() along
with the processing logic.
Separate the initial read-in and update read-in logic into
rbd_dev_setup_parent() and rbd_dev_update_parent() respectively and
have rbd_dev_v2_parent_info() just populate struct parent_image_info
(i.e. what get_parent_info() did). Some existing QoI issues, like
flatten of a standalone clone being disregarded on refresh, remain.
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6ed5520ef303..d62a0298c890 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5594,6 +5594,14 @@ struct parent_image_info {
u64 overlap;
};
+static void rbd_parent_info_cleanup(struct parent_image_info *pii)
+{
+ kfree(pii->pool_ns);
+ kfree(pii->image_id);
+
+ memset(pii, 0, sizeof(*pii));
+}
+
/*
* The caller is responsible for @pii.
*/
@@ -5663,6 +5671,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
if (pii->has_overlap)
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
@@ -5701,14 +5712,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
pii->has_overlap = true;
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
return -EINVAL;
}
-static int get_parent_info(struct rbd_device *rbd_dev,
- struct parent_image_info *pii)
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
{
struct page *req_page, *reply_page;
void *p;
@@ -5736,7 +5750,7 @@ static int get_parent_info(struct rbd_device *rbd_dev,
return ret;
}
-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
{
struct rbd_spec *parent_spec;
struct parent_image_info pii = { 0 };
@@ -5746,37 +5760,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
if (!parent_spec)
return -ENOMEM;
- ret = get_parent_info(rbd_dev, &pii);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out_err;
- dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
- __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
- pii.has_overlap, pii.overlap);
-
- if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
- /*
- * Either the parent never existed, or we have
- * record of it but the image got flattened so it no
- * longer has a parent. When the parent of a
- * layered image disappears we immediately set the
- * overlap to 0. The effect of this is that all new
- * requests will be treated as if the image had no
- * parent.
- *
- * If !pii.has_overlap, the parent image spec is not
- * applicable. It's there to avoid duplication in each
- * snapshot record.
- */
- if (rbd_dev->parent_overlap) {
- rbd_dev->parent_overlap = 0;
- rbd_dev_parent_put(rbd_dev);
- pr_info("%s: clone image has been flattened\n",
- rbd_dev->disk->disk_name);
- }
-
+ if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
goto out; /* No parent? No problem. */
- }
/* The ceph file layout needs to fit pool id in 32 bits */
@@ -5788,46 +5777,34 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
/*
- * The parent won't change (except when the clone is
- * flattened, already handled that). So we only need to
- * record the parent spec we have not already done so.
+ * The parent won't change except when the clone is flattened,
+ * so we only need to record the parent image spec once.
*/
- if (!rbd_dev->parent_spec) {
- parent_spec->pool_id = pii.pool_id;
- if (pii.pool_ns && *pii.pool_ns) {
- parent_spec->pool_ns = pii.pool_ns;
- pii.pool_ns = NULL;
- }
- parent_spec->image_id = pii.image_id;
- pii.image_id = NULL;
- parent_spec->snap_id = pii.snap_id;
-
- rbd_dev->parent_spec = parent_spec;
- parent_spec = NULL; /* rbd_dev now owns this */
+ parent_spec->pool_id = pii.pool_id;
+ if (pii.pool_ns && *pii.pool_ns) {
+ parent_spec->pool_ns = pii.pool_ns;
+ pii.pool_ns = NULL;
}
+ parent_spec->image_id = pii.image_id;
+ pii.image_id = NULL;
+ parent_spec->snap_id = pii.snap_id;
+
+ rbd_assert(!rbd_dev->parent_spec);
+ rbd_dev->parent_spec = parent_spec;
+ parent_spec = NULL; /* rbd_dev now owns this */
/*
- * We always update the parent overlap. If it's zero we issue
- * a warning, as we will proceed as if there was no parent.
+ * Record the parent overlap. If it's zero, issue a warning as
+ * we will proceed as if there is no parent.
*/
- if (!pii.overlap) {
- if (parent_spec) {
- /* refresh, careful to warn just once */
- if (rbd_dev->parent_overlap)
- rbd_warn(rbd_dev,
- "clone now standalone (overlap became 0)");
- } else {
- /* initial probe */
- rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
- }
- }
+ if (!pii.overlap)
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
rbd_dev->parent_overlap = pii.overlap;
out:
ret = 0;
out_err:
- kfree(pii.pool_ns);
- kfree(pii.image_id);
+ rbd_parent_info_cleanup(&pii);
rbd_spec_put(parent_spec);
return ret;
}
@@ -6977,7 +6954,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
}
if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_setup_parent(rbd_dev);
if (ret)
goto err_out_probe;
}
@@ -7026,9 +7003,47 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
}
}
+static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
+{
+ if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
+ /*
+ * Either the parent never existed, or we have
+ * record of it but the image got flattened so it no
+ * longer has a parent. When the parent of a
+ * layered image disappears we immediately set the
+ * overlap to 0. The effect of this is that all new
+ * requests will be treated as if the image had no
+ * parent.
+ *
+ * If !pii.has_overlap, the parent image spec is not
+ * applicable. It's there to avoid duplication in each
+ * snapshot record.
+ */
+ if (rbd_dev->parent_overlap) {
+ rbd_dev->parent_overlap = 0;
+ rbd_dev_parent_put(rbd_dev);
+ pr_info("%s: clone has been flattened\n",
+ rbd_dev->disk->disk_name);
+ }
+ } else {
+ rbd_assert(rbd_dev->parent_spec);
+
+ /*
+ * Update the parent overlap. If it became zero, issue
+ * a warning as we will proceed as if there is no parent.
+ */
+ if (!pii->overlap && rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone has become standalone (overlap 0)");
+ rbd_dev->parent_overlap = pii->overlap;
+ }
+}
+
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
+ struct parent_image_info pii = { 0 };
u64 mapping_size;
int ret;
@@ -7044,12 +7059,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
* mapped image getting flattened.
*/
if (rbd_dev->parent) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out;
}
rbd_dev_update_header(rbd_dev, &header);
+ if (rbd_dev->parent)
+ rbd_dev_update_parent(rbd_dev, &pii);
rbd_assert(!rbd_is_snap(rbd_dev));
rbd_dev->mapping.size = rbd_dev->header.image_size;
@@ -7059,6 +7076,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
if (!ret && mapping_size != rbd_dev->mapping.size)
rbd_dev_update_size(rbd_dev);
+ rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x c10311776f0a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023100447-slate-onward-bd1f@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c10311776f0a8ddea2276df96e255625b07045a8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov(a)gmail.com>
Date: Wed, 20 Sep 2023 18:38:26 +0200
Subject: [PATCH] rbd: decouple parent info read-in from updating rbd_dev
Unlike header read-in, parent info read-in is already decoupled in
get_parent_info(), but it's buried in rbd_dev_v2_parent_info() along
with the processing logic.
Separate the initial read-in and update read-in logic into
rbd_dev_setup_parent() and rbd_dev_update_parent() respectively and
have rbd_dev_v2_parent_info() just populate struct parent_image_info
(i.e. what get_parent_info() did). Some existing QoI issues, like
flatten of a standalone clone being disregarded on refresh, remain.
Signed-off-by: Ilya Dryomov <idryomov(a)gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang(a)easystack.cn>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6ed5520ef303..d62a0298c890 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -5594,6 +5594,14 @@ struct parent_image_info {
u64 overlap;
};
+static void rbd_parent_info_cleanup(struct parent_image_info *pii)
+{
+ kfree(pii->pool_ns);
+ kfree(pii->image_id);
+
+ memset(pii, 0, sizeof(*pii));
+}
+
/*
* The caller is responsible for @pii.
*/
@@ -5663,6 +5671,9 @@ static int __get_parent_info(struct rbd_device *rbd_dev,
if (pii->has_overlap)
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
@@ -5701,14 +5712,17 @@ static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
pii->has_overlap = true;
ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+ dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+ __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+ pii->has_overlap, pii->overlap);
return 0;
e_inval:
return -EINVAL;
}
-static int get_parent_info(struct rbd_device *rbd_dev,
- struct parent_image_info *pii)
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
{
struct page *req_page, *reply_page;
void *p;
@@ -5736,7 +5750,7 @@ static int get_parent_info(struct rbd_device *rbd_dev,
return ret;
}
-static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
{
struct rbd_spec *parent_spec;
struct parent_image_info pii = { 0 };
@@ -5746,37 +5760,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
if (!parent_spec)
return -ENOMEM;
- ret = get_parent_info(rbd_dev, &pii);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out_err;
- dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
- __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
- pii.has_overlap, pii.overlap);
-
- if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
- /*
- * Either the parent never existed, or we have
- * record of it but the image got flattened so it no
- * longer has a parent. When the parent of a
- * layered image disappears we immediately set the
- * overlap to 0. The effect of this is that all new
- * requests will be treated as if the image had no
- * parent.
- *
- * If !pii.has_overlap, the parent image spec is not
- * applicable. It's there to avoid duplication in each
- * snapshot record.
- */
- if (rbd_dev->parent_overlap) {
- rbd_dev->parent_overlap = 0;
- rbd_dev_parent_put(rbd_dev);
- pr_info("%s: clone image has been flattened\n",
- rbd_dev->disk->disk_name);
- }
-
+ if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
goto out; /* No parent? No problem. */
- }
/* The ceph file layout needs to fit pool id in 32 bits */
@@ -5788,46 +5777,34 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
/*
- * The parent won't change (except when the clone is
- * flattened, already handled that). So we only need to
- * record the parent spec we have not already done so.
+ * The parent won't change except when the clone is flattened,
+ * so we only need to record the parent image spec once.
*/
- if (!rbd_dev->parent_spec) {
- parent_spec->pool_id = pii.pool_id;
- if (pii.pool_ns && *pii.pool_ns) {
- parent_spec->pool_ns = pii.pool_ns;
- pii.pool_ns = NULL;
- }
- parent_spec->image_id = pii.image_id;
- pii.image_id = NULL;
- parent_spec->snap_id = pii.snap_id;
-
- rbd_dev->parent_spec = parent_spec;
- parent_spec = NULL; /* rbd_dev now owns this */
+ parent_spec->pool_id = pii.pool_id;
+ if (pii.pool_ns && *pii.pool_ns) {
+ parent_spec->pool_ns = pii.pool_ns;
+ pii.pool_ns = NULL;
}
+ parent_spec->image_id = pii.image_id;
+ pii.image_id = NULL;
+ parent_spec->snap_id = pii.snap_id;
+
+ rbd_assert(!rbd_dev->parent_spec);
+ rbd_dev->parent_spec = parent_spec;
+ parent_spec = NULL; /* rbd_dev now owns this */
/*
- * We always update the parent overlap. If it's zero we issue
- * a warning, as we will proceed as if there was no parent.
+ * Record the parent overlap. If it's zero, issue a warning as
+ * we will proceed as if there is no parent.
*/
- if (!pii.overlap) {
- if (parent_spec) {
- /* refresh, careful to warn just once */
- if (rbd_dev->parent_overlap)
- rbd_warn(rbd_dev,
- "clone now standalone (overlap became 0)");
- } else {
- /* initial probe */
- rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
- }
- }
+ if (!pii.overlap)
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
rbd_dev->parent_overlap = pii.overlap;
out:
ret = 0;
out_err:
- kfree(pii.pool_ns);
- kfree(pii.image_id);
+ rbd_parent_info_cleanup(&pii);
rbd_spec_put(parent_spec);
return ret;
}
@@ -6977,7 +6954,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
}
if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_setup_parent(rbd_dev);
if (ret)
goto err_out_probe;
}
@@ -7026,9 +7003,47 @@ static void rbd_dev_update_header(struct rbd_device *rbd_dev,
}
}
+static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
+ struct parent_image_info *pii)
+{
+ if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
+ /*
+ * Either the parent never existed, or we have
+ * record of it but the image got flattened so it no
+ * longer has a parent. When the parent of a
+ * layered image disappears we immediately set the
+ * overlap to 0. The effect of this is that all new
+ * requests will be treated as if the image had no
+ * parent.
+ *
+ * If !pii.has_overlap, the parent image spec is not
+ * applicable. It's there to avoid duplication in each
+ * snapshot record.
+ */
+ if (rbd_dev->parent_overlap) {
+ rbd_dev->parent_overlap = 0;
+ rbd_dev_parent_put(rbd_dev);
+ pr_info("%s: clone has been flattened\n",
+ rbd_dev->disk->disk_name);
+ }
+ } else {
+ rbd_assert(rbd_dev->parent_spec);
+
+ /*
+ * Update the parent overlap. If it became zero, issue
+ * a warning as we will proceed as if there is no parent.
+ */
+ if (!pii->overlap && rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone has become standalone (overlap 0)");
+ rbd_dev->parent_overlap = pii->overlap;
+ }
+}
+
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
struct rbd_image_header header = { 0 };
+ struct parent_image_info pii = { 0 };
u64 mapping_size;
int ret;
@@ -7044,12 +7059,14 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
* mapped image getting flattened.
*/
if (rbd_dev->parent) {
- ret = rbd_dev_v2_parent_info(rbd_dev);
+ ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
if (ret)
goto out;
}
rbd_dev_update_header(rbd_dev, &header);
+ if (rbd_dev->parent)
+ rbd_dev_update_parent(rbd_dev, &pii);
rbd_assert(!rbd_is_snap(rbd_dev));
rbd_dev->mapping.size = rbd_dev->header.image_size;
@@ -7059,6 +7076,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
if (!ret && mapping_size != rbd_dev->mapping.size)
rbd_dev_update_size(rbd_dev);
+ rbd_parent_info_cleanup(&pii);
rbd_image_header_cleanup(&header);
return ret;
}