From: Liang He windhl@126.com
[ Upstream commit 7c32919a378782c95c72bc028b5c30dfe8c11f82 ]
In omap4_sram_init(), of_find_compatible_node() will return a node pointer with refcount incremented. We should use of_node_put() when it is not used anymore.
Signed-off-by: Liang He windhl@126.com Message-Id: 20220628112939.160737-1-windhl@126.com Signed-off-by: Tony Lindgren tony@atomide.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm/mach-omap2/omap4-common.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm/mach-omap2/omap4-common.c b/arch/arm/mach-omap2/omap4-common.c index 6d1eb4eefefe5..d9ed2a5dcd5ef 100644 --- a/arch/arm/mach-omap2/omap4-common.c +++ b/arch/arm/mach-omap2/omap4-common.c @@ -140,6 +140,7 @@ static int __init omap4_sram_init(void) __func__); else sram_sync = (void __iomem *)gen_pool_alloc(sram_pool, PAGE_SIZE); + of_node_put(np);
return 0; }
From: Konrad Dybcio konrad.dybcio@linaro.org
[ Upstream commit 67fb53745e0b38275fa0b422b6a3c6c1c028c9a2 ]
On eMMC devices, the UFS clocks aren't started in the bootloader (or well, at least it should not be, as that would just leak power..), which results in platform reboots when trying to access the unclocked UFS hardware, which unfortunately happens on each and every boot, as interconnect calls sync_state and goes over each and every path.
Signed-off-by: Konrad Dybcio konrad.dybcio@linaro.org Reviewed-by: Dmitry Baryshkov dmitry.baryshkov@linaro.org Tested-by: Dmitry Baryshkov dmitry.baryshkov@linaro.org #db820c Signed-off-by: Bjorn Andersson andersson@kernel.org Link: https://lore.kernel.org/r/20221210200353.418391-6-konrad.dybcio@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm64/boot/dts/qcom/msm8996.dtsi | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi index 1107befc3b091..2344a9802ff54 100644 --- a/arch/arm64/boot/dts/qcom/msm8996.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi @@ -829,9 +829,11 @@ a2noc: interconnect@583000 { compatible = "qcom,msm8996-a2noc"; reg = <0x00583000 0x7000>; #interconnect-cells = <1>; - clock-names = "bus", "bus_a"; + clock-names = "bus", "bus_a", "aggre2_ufs_axi", "ufs_axi"; clocks = <&rpmcc RPM_SMD_AGGR2_NOC_CLK>, - <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>; + <&rpmcc RPM_SMD_AGGR2_NOC_A_CLK>, + <&gcc GCC_AGGRE2_UFS_AXI_CLK>, + <&gcc GCC_UFS_AXI_CLK>; };
mnoc: interconnect@5a4000 {
From: Jan Kara jack@suse.cz
[ Upstream commit 3d2d7e61553dbcc8ba45201d8ae4f383742c8202 ]
Similarly to other filesystems define EFSCORRUPTED error code for reporting internal filesystem corruption.
Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin sashal@kernel.org --- fs/udf/udf_sb.h | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 4fa620543d302..2205859731dc2 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -51,6 +51,8 @@ #define MF_DUPLICATE_MD 0x01 #define MF_MIRROR_FE_LOADED 0x02
+#define EFSCORRUPTED EUCLEAN + struct udf_meta_data { __u32 s_meta_file_loc; __u32 s_mirror_file_loc;
From: Peter Zijlstra peterz@infradead.org
[ Upstream commit 0e26e1de0032779e43929174339429c16307a299 ]
Low level noinstr context-tracking code is calling out to instrumented code on KASAN:
vmlinux.o: warning: objtool: __ct_user_enter+0x72: call to __kasan_check_write() leaves .noinstr.text section vmlinux.o: warning: objtool: __ct_user_exit+0x47: call to __kasan_check_write() leaves .noinstr.text section
Use even lower level atomic methods to avoid the instrumentation.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Link: https://lore.kernel.org/r/20230112195542.458034262@infradead.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/context_tracking.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 77978e3723771..a09f1c19336ae 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -510,7 +510,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Even if context tracking is disabled on this CPU, because it's outside @@ -527,7 +527,7 @@ void noinstr __ct_user_enter(enum ctx_state state) */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, state); + arch_atomic_set(&ct->state, state); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -535,7 +535,7 @@ void noinstr __ct_user_enter(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_add(state, &ct->state); + arch_atomic_add(state, &ct->state); } } } @@ -630,12 +630,12 @@ void noinstr __ct_user_exit(enum ctx_state state) * In this we case we don't care about any concurrency/ordering. */ if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL);
} else { if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { /* Tracking for vtime only, no concurrent RCU EQS accounting */ - atomic_set(&ct->state, CONTEXT_KERNEL); + arch_atomic_set(&ct->state, CONTEXT_KERNEL); } else { /* * Tracking for vtime and RCU EQS. Make sure we don't race @@ -643,7 +643,7 @@ void noinstr __ct_user_exit(enum ctx_state state) * RCU only requires RCU_DYNTICKS_IDX increments to be fully * ordered. */ - atomic_sub(state, &ct->state); + arch_atomic_sub(state, &ct->state); } } }
From: Nicholas Piggin npiggin@gmail.com
[ Upstream commit 001c28e57187570e4b5aa4492c7a957fb6d65d7b ]
If a task oopses with irqs disabled, this can cause various cascading problems in the oops path such as sleep-from-invalid warnings, and potentially worse.
Since commit 0258b5fd7c712 ("coredump: Limit coredumps to a single thread group"), the unconditional irq enable in coredump_task_exit() will "fix" the irq state to be enabled early in do_exit(), so currently this may not be triggerable, but that is coincidental and fragile.
Detect and fix the irqs_disabled() condition in the oops path before calling do_exit(), similarly to the way in_atomic() is handled.
Reported-by: Michael Ellerman mpe@ellerman.id.au Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: "Eric W. Biederman" ebiederm@xmission.com Link: https://lore.kernel.org/lkml/20221004094401.708299-1-npiggin@gmail.com/ Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/exit.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/kernel/exit.c b/kernel/exit.c index 15dc2ec80c467..bccfa4218356e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -807,6 +807,8 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead;
+ WARN_ON(irqs_disabled()); + synchronize_group_exit(tsk, code);
WARN_ON(tsk->plug); @@ -938,6 +940,11 @@ void __noreturn make_task_dead(int signr) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!");
+ if (unlikely(irqs_disabled())) { + pr_info("note: %s[%d] exited with irqs disabled\n", + current->comm, task_pid_nr(current)); + local_irq_enable(); + } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current),
From: Markuss Broks markuss.broks@gmail.com
[ Upstream commit 5d5aa219a790d61cad2c38e1aa32058f16ad2f0b ]
For some reason, the driver adding support for Exynos5420 MIPI phy back in 2016 wasn't used on Exynos5420, which caused a kernel panic. Add the proper compatible for it.
Signed-off-by: Markuss Broks markuss.broks@gmail.com Link: https://lore.kernel.org/r/20230121201844.46872-2-markuss.broks@gmail.com Signed-off-by: Krzysztof Kozlowski krzysztof.kozlowski@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm/boot/dts/exynos5420.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm/boot/dts/exynos5420.dtsi b/arch/arm/boot/dts/exynos5420.dtsi index 9f2523a873d9d..62263eb91b3cc 100644 --- a/arch/arm/boot/dts/exynos5420.dtsi +++ b/arch/arm/boot/dts/exynos5420.dtsi @@ -592,7 +592,7 @@ dp_phy: dp-video-phy { };
mipi_phy: mipi-video-phy { - compatible = "samsung,s5pv210-mipi-video-phy"; + compatible = "samsung,exynos5420-mipi-video-phy"; syscon = <&pmu_system_controller>; #phy-cells = <1>; };
From: Jann Horn jannh@google.com
[ Upstream commit 47d586913f2abec4d240bae33417f537fda987ec ]
Currently, filp_close() and generic_shutdown_super() use printk() to log messages when bugs are detected. This is problematic because infrastructure like syzkaller has no idea that this message indicates a bug. In addition, some people explicitly want their kernels to BUG() when kernel data corruption has been detected (CONFIG_BUG_ON_DATA_CORRUPTION). And finally, when generic_shutdown_super() detects remaining inodes on a system without CONFIG_BUG_ON_DATA_CORRUPTION, it would be nice if later accesses to a busy inode would at least crash somewhat cleanly rather than walking through freed memory.
To address all three, use CHECK_DATA_CORRUPTION() when kernel bugs are detected.
Signed-off-by: Jann Horn jannh@google.com Reviewed-by: Christian Brauner (Microsoft) brauner@kernel.org Reviewed-by: Kees Cook keescook@chromium.org Signed-off-by: Christian Brauner (Microsoft) brauner@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/open.c | 5 +++-- fs/super.c | 21 +++++++++++++++++---- include/linux/poison.h | 3 +++ 3 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/fs/open.c b/fs/open.c index a81319b6177f6..7853deb6fcf47 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1411,8 +1411,9 @@ int filp_close(struct file *filp, fl_owner_t id) { int retval = 0;
- if (!file_count(filp)) { - printk(KERN_ERR "VFS: Close: file count is 0\n"); + if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, + "VFS: Close: file count is 0 (f_op=%ps)", + filp->f_op)) { return 0; }
diff --git a/fs/super.c b/fs/super.c index 8d39e4f11cfa3..4f8a626a35cd9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -491,10 +491,23 @@ void generic_shutdown_super(struct super_block *sb) if (sop->put_super) sop->put_super(sb);
- if (!list_empty(&sb->s_inodes)) { - printk("VFS: Busy inodes after unmount of %s. " - "Self-destruct in 5 seconds. Have a nice day...\n", - sb->s_id); + if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), + "VFS: Busy inodes after unmount of %s (%s)", + sb->s_id, sb->s_type->name)) { + /* + * Adding a proper bailout path here would be hard, but + * we can at least make it more likely that a later + * iput_final() or such crashes cleanly. + */ + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + inode->i_op = VFS_PTR_POISON; + inode->i_sb = VFS_PTR_POISON; + inode->i_mapping = VFS_PTR_POISON; + } + spin_unlock(&sb->s_inode_list_lock); } } spin_lock(&sb_lock); diff --git a/include/linux/poison.h b/include/linux/poison.h index 2d3249eb0e62d..0e8a1f2ceb2f1 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -84,4 +84,7 @@ /********** kernel/bpf/ **********/ #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
+/********** VFS **********/ +#define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) + #endif
From: Li Nan linan122@huawei.com
[ Upstream commit 984af1e66b4126cf145153661cc24c213e2ec231 ]
echo max of u64 to cost.model can cause divide by 0 error.
# echo 8:0 rbps=18446744073709551615 > /sys/fs/cgroup/io.cost.model
divide error: 0000 [#1] PREEMPT SMP RIP: 0010:calc_lcoefs+0x4c/0xc0 Call Trace: <TASK> ioc_refresh_params+0x2b3/0x4f0 ioc_cost_model_write+0x3cb/0x4c0 ? _copy_from_iter+0x6d/0x6c0 ? kernfs_fop_write_iter+0xfc/0x270 cgroup_file_write+0xa0/0x200 kernfs_fop_write_iter+0x17d/0x270 vfs_write+0x414/0x620 ksys_write+0x73/0x160 __x64_sys_write+0x1e/0x30 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd
calc_lcoefs() uses the input value of cost.model in DIV_ROUND_UP_ULL, overflow would happen if bps plus IOC_PAGE_SIZE is greater than ULLONG_MAX, it can cause divide by 0 error.
Fix the problem by setting basecost
Signed-off-by: Li Nan linan122@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Acked-by: Tejun Heo tj@kernel.org Link: https://lore.kernel.org/r/20230117070806.3857142-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- block/blk-iocost.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 495396425bade..bfc33fa9a063c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -865,9 +865,14 @@ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
*page = *seqio = *randio = 0;
- if (bps) - *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, - DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE)); + if (bps) { + u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); + + if (bps_pages) + *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); + else + *page = 1; + }
if (seqiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
From: Yu Kuai yukuai3@huawei.com
[ Upstream commit c7241babf0855d8a6180cd1743ff0ec34de40b4e ]
Some cgroup policies will access parent pd through child pd even after pd_offline_fn() is done. If pd_free_fn() for parent is called before child, then UAF can be triggered. Hence it's better to guarantee the order of pd_free_fn().
Currently refcount of parent blkg is dropped in __blkg_release(), which is before pd_free_fn() is called in blkg_free_work_fn() while blkg_free_work_fn() is called asynchronously.
This patch make sure pd_free_fn() called from removing cgroup is ordered by delaying dropping parent refcount after calling pd_free_fn() for child.
BTW, pd_free_fn() will also be called from blkcg_deactivate_policy() from deleting device, and following patches will guarantee the order.
Signed-off-by: Yu Kuai yukuai3@huawei.com Acked-by: Tejun Heo tj@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Link: https://lore.kernel.org/r/20230119110350.2287325-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7c91d9195da8d..8d1b7757f1e4f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -93,6 +93,8 @@ static void blkg_free_workfn(struct work_struct *work) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+ if (blkg->parent) + blkg_put(blkg->parent); if (blkg->q) blk_put_queue(blkg->q); free_percpu(blkg->iostat_cpu); @@ -127,8 +129,6 @@ static void __blkg_release(struct rcu_head *rcu)
/* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); blkg_free(blkg); }
From: Yu Kuai yukuai3@huawei.com
[ Upstream commit f1c006f1c6850c14040f8337753a63119bba39b9 ]
Currently parent pd can be freed before child pd:
t1: remove cgroup C1 blkcg_destroy_blkgs blkg_destroy list_del_init(&blkg->q_node) // remove blkg from queue list percpu_ref_kill(&blkg->refcnt) blkg_release call_rcu
t2: from t1 __blkg_release blkg_free schedule_work t4: deactivate policy blkcg_deactivate_policy pd_free_fn // parent of C1 is freed first t3: from t2 blkg_free_workfn pd_free_fn
If policy(for example, ioc_timer_fn() from iocost) access parent pd from child pd after pd_offline_fn(), then UAF can be triggered.
Fix the problem by delaying 'list_del_init(&blkg->q_node)' from blkg_destroy() to blkg_free_workfn(), and using a new disk level mutex to synchronize blkg_free_workfn() and blkcg_deactivate_policy().
Signed-off-by: Yu Kuai yukuai3@huawei.com Acked-by: Tejun Heo tj@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Link: https://lore.kernel.org/r/20230119110350.2287325-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- block/blk-cgroup.c | 35 +++++++++++++++++++++++++++++------ include/linux/blkdev.h | 1 + 2 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8d1b7757f1e4f..f8b21bead6552 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -87,16 +87,32 @@ static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, free_work); + struct request_queue *q = blkg->q; int i;
+ /* + * pd_free_fn() can also be called from blkcg_deactivate_policy(), + * in order to make sure pd_free_fn() is called in order, the deletion + * of the list blkg->q_node is delayed to here from blkg_destroy(), and + * blkcg_mutex is used to synchronize blkg_free_workfn() and + * blkcg_deactivate_policy(). + */ + if (q) + mutex_lock(&q->blkcg_mutex); + for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
if (blkg->parent) blkg_put(blkg->parent); - if (blkg->q) - blk_put_queue(blkg->q); + + if (q) { + list_del_init(&blkg->q_node); + mutex_unlock(&q->blkcg_mutex); + blk_put_queue(q); + } + free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); @@ -425,9 +441,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock);
- /* Something wrong if we are trying to remove same group twice */ - WARN_ON_ONCE(list_empty(&blkg->q_node)); - WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + /* + * blkg stays on the queue list until blkg_free_workfn(), see details in + * blkg_free_workfn(), hence this function can be called from + * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before + * blkg_free_workfn(). + */ + if (hlist_unhashed(&blkg->blkcg_node)) + return;
for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -439,7 +460,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) blkg->online = false;
radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); - list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node);
/* @@ -1226,6 +1246,7 @@ int blkcg_init_disk(struct gendisk *disk) int ret;
INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex);
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) @@ -1463,6 +1484,7 @@ void blkcg_deactivate_policy(struct request_queue *q, if (queue_is_mq(q)) blk_mq_freeze_queue(q);
+ mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock);
__clear_bit(pol->plid, q->blkcg_pols); @@ -1481,6 +1503,7 @@ void blkcg_deactivate_policy(struct request_queue *q, }
spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex);
if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 891f8cbcd0436..1680b6e1e5362 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -487,6 +487,7 @@ struct request_queue { DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); struct blkcg_gq *root_blkg; struct list_head blkg_list; + struct mutex blkcg_mutex; #endif
struct queue_limits limits;
From: Greg Kroah-Hartman gregkh@linuxfoundation.org
[ Upstream commit 83e8864fee26f63a7435e941b7c36a20fd6fe93e ]
When calling debugfs_lookup() the result must have dput() called on it, otherwise the memory will leak over time. To make things simpler, just call debugfs_lookup_and_remove() instead which handles all of the logic at once.
Cc: Jens Axboe axboe@kernel.dk Cc: Steven Rostedt rostedt@goodmis.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-trace-kernel@vger.kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Reviewed-by: Bart Van Assche bvanassche@acm.org Link: https://lore.kernel.org/r/20230202141956.2299521-1-gregkh@linuxfoundation.or... Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/trace/blktrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a66cff5a18579..a5b35bcfb0602 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) * under 'q->debugfs_dir', thus lookup and remove them. */ if (!bt->dir) { - debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); - debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + debugfs_lookup_and_remove("dropped", q->debugfs_dir); + debugfs_lookup_and_remove("msg", q->debugfs_dir); } else { debugfs_remove(bt->dir); }
From: Eric Biggers ebiggers@google.com
[ Upstream commit ec64036e68634231f5891faa2b7a81cdc5dcd001 ]
Now that the key associated with the "test_dummy_operation" mount option is added on-demand when it's needed, rather than immediately when the filesystem is mounted, fscrypt_destroy_keyring() no longer needs to be called from __put_super() to avoid a memory leak on mount failure.
Remove this call, which was causing confusion because it appeared to be a sleep-in-atomic bug (though it wasn't, for a somewhat-subtle reason).
Signed-off-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20230208062107.199831-5-ebiggers@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/super.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/fs/super.c b/fs/super.c index 4f8a626a35cd9..76d47620b930d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -291,7 +291,6 @@ static void __put_super(struct super_block *s) WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); - fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu);
On Sat, Feb 25, 2023 at 10:42:47PM -0500, Sasha Levin wrote:
From: Eric Biggers ebiggers@google.com
[ Upstream commit ec64036e68634231f5891faa2b7a81cdc5dcd001 ]
Now that the key associated with the "test_dummy_operation" mount option is added on-demand when it's needed, rather than immediately when the filesystem is mounted, fscrypt_destroy_keyring() no longer needs to be called from __put_super() to avoid a memory leak on mount failure.
Remove this call, which was causing confusion because it appeared to be a sleep-in-atomic bug (though it wasn't, for a somewhat-subtle reason).
Signed-off-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20230208062107.199831-5-ebiggers@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Why is this being backported?
- Eric
On Sat, Feb 25, 2023 at 08:07:55PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 10:42:47PM -0500, Sasha Levin wrote:
From: Eric Biggers ebiggers@google.com
[ Upstream commit ec64036e68634231f5891faa2b7a81cdc5dcd001 ]
Now that the key associated with the "test_dummy_operation" mount option is added on-demand when it's needed, rather than immediately when the filesystem is mounted, fscrypt_destroy_keyring() no longer needs to be called from __put_super() to avoid a memory leak on mount failure.
Remove this call, which was causing confusion because it appeared to be a sleep-in-atomic bug (though it wasn't, for a somewhat-subtle reason).
Signed-off-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20230208062107.199831-5-ebiggers@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Why is this being backported?
- Eric
BTW, can you please permanently exclude all commits authored by me from AUTOSEL so that I don't have to repeatedly complain about every commit individually? Especially when these mails often come on weekends and holidays.
I know how to use Cc stable, and how to ask explicitly for a stable backport if I find out after the fact that one is needed. (And other real people can always ask too... not counting AUTOSEL, even though you are sending the AUTOSEL emails, since clearly they go through no or very little human review.)
Of course, it's not just me that AUTOSEL isn't working for. So, you'll still continue backporting random commits that I have to spend hours bisecting, e.g. https://lore.kernel.org/stable/20220921155332.234913-7-sashal@kernel.org.
But at least I won't have to deal with this garbage for my own commits.
Now, I'm not sure I'll get a response to this --- I received no response to my last AUTOSEL question at https://lore.kernel.org/stable/Y1DTFiP12ws04eOM@sol.localdomain. So to hopefully entice you to actually do something, I'm also letting you know that I won't be reviewing any AUTOSEL mails for my commits anymore.
- Eric
On Sat, Feb 25, 2023 at 09:30:37PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 08:07:55PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 10:42:47PM -0500, Sasha Levin wrote:
From: Eric Biggers ebiggers@google.com
[ Upstream commit ec64036e68634231f5891faa2b7a81cdc5dcd001 ]
Now that the key associated with the "test_dummy_operation" mount option is added on-demand when it's needed, rather than immediately when the filesystem is mounted, fscrypt_destroy_keyring() no longer needs to be called from __put_super() to avoid a memory leak on mount failure.
Remove this call, which was causing confusion because it appeared to be a sleep-in-atomic bug (though it wasn't, for a somewhat-subtle reason).
Signed-off-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20230208062107.199831-5-ebiggers@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Why is this being backported?
- Eric
BTW, can you please permanently exclude all commits authored by me from AUTOSEL so that I don't have to repeatedly complain about every commit individually? Especially when these mails often come on weekends and holidays.
I know how to use Cc stable, and how to ask explicitly for a stable backport if I find out after the fact that one is needed. (And other real people can always ask too... not counting AUTOSEL, even though you are sending the AUTOSEL emails, since clearly they go through no or very little human review.)
Of course, it's not just me that AUTOSEL isn't working for. So, you'll still continue backporting random commits that I have to spend hours bisecting, e.g. https://lore.kernel.org/stable/20220921155332.234913-7-sashal@kernel.org.
But at least I won't have to deal with this garbage for my own commits.
Now, I'm not sure I'll get a response to this --- I received no response to my last AUTOSEL question at https://lore.kernel.org/stable/Y1DTFiP12ws04eOM@sol.localdomain. So to hopefully entice you to actually do something, I'm also letting you know that I won't be reviewing any AUTOSEL mails for my commits anymore.
The really annoying thing is that someone even replied to your AUTOSEL email for that broken patch and told you it is broken (https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox....), and ***you ignored it and applied the patch anyway***.
Why are you even sending these emails if you are ignoring feedback anyway?
How do I even get you to not apply a patch? Is it even possible?
I guess I might as well just add an email filter that auto-deletes all AUTOSEL emails, as apparently there's no point in responding anyway?
- Eric
On Sun, Feb 26, 2023 at 2:24 PM Eric Biggers ebiggers@kernel.org wrote:
On Sat, Feb 25, 2023 at 09:30:37PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 08:07:55PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 10:42:47PM -0500, Sasha Levin wrote:
From: Eric Biggers ebiggers@google.com
[ Upstream commit ec64036e68634231f5891faa2b7a81cdc5dcd001 ]
Now that the key associated with the "test_dummy_operation" mount option is added on-demand when it's needed, rather than immediately when the filesystem is mounted, fscrypt_destroy_keyring() no longer needs to be called from __put_super() to avoid a memory leak on mount failure.
Remove this call, which was causing confusion because it appeared to be a sleep-in-atomic bug (though it wasn't, for a somewhat-subtle reason).
Signed-off-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20230208062107.199831-5-ebiggers@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Why is this being backported?
- Eric
BTW, can you please permanently exclude all commits authored by me from AUTOSEL so that I don't have to repeatedly complain about every commit individually? Especially when these mails often come on weekends and holidays.
I know how to use Cc stable, and how to ask explicitly for a stable backport if I find out after the fact that one is needed. (And other real people can always ask too... not counting AUTOSEL, even though you are sending the AUTOSEL emails, since clearly they go through no or very little human review.)
Of course, it's not just me that AUTOSEL isn't working for. So, you'll still continue backporting random commits that I have to spend hours bisecting, e.g. https://lore.kernel.org/stable/20220921155332.234913-7-sashal@kernel.org.
But at least I won't have to deal with this garbage for my own commits.
Now, I'm not sure I'll get a response to this --- I received no response to my last AUTOSEL question at https://lore.kernel.org/stable/Y1DTFiP12ws04eOM@sol.localdomain. So to hopefully entice you to actually do something, I'm also letting you know that I won't be reviewing any AUTOSEL mails for my commits anymore.
The really annoying thing is that someone even replied to your AUTOSEL email for that broken patch and told you it is broken (https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox....), and ***you ignored it and applied the patch anyway***.
Why are you even sending these emails if you are ignoring feedback anyway?
How do I even get you to not apply a patch? Is it even possible?
I guess I might as well just add an email filter that auto-deletes all AUTOSEL emails, as apparently there's no point in responding anyway?
I test this branch for Greg but don't pay attention to these emails Sasha sends out (because there's just waaaaay too many of them to look through unless they get a reply; I find them quite annoying otherwise.) But if these commits automatically get applied to stable trees, even with objections from the committers, then I personally question the methodology for having AUTOSEL in the first place. Commits should be tested and backported with explicit purpose by their developers, IMO.
-- Slade
On Sun, Feb 26, 2023 at 11:24:36AM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 09:30:37PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 08:07:55PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 10:42:47PM -0500, Sasha Levin wrote:
From: Eric Biggers ebiggers@google.com
[ Upstream commit ec64036e68634231f5891faa2b7a81cdc5dcd001 ]
Now that the key associated with the "test_dummy_operation" mount option is added on-demand when it's needed, rather than immediately when the filesystem is mounted, fscrypt_destroy_keyring() no longer needs to be called from __put_super() to avoid a memory leak on mount failure.
Remove this call, which was causing confusion because it appeared to be a sleep-in-atomic bug (though it wasn't, for a somewhat-subtle reason).
Signed-off-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20230208062107.199831-5-ebiggers@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Why is this being backported?
- Eric
BTW, can you please permanently exclude all commits authored by me from AUTOSEL so that I don't have to repeatedly complain about every commit individually? Especially when these mails often come on weekends and holidays.
Yup, no problem - I'll ignore any commits authored by you.
I know how to use Cc stable, and how to ask explicitly for a stable backport if I find out after the fact that one is needed. (And other real people can always ask too... not counting AUTOSEL, even though you are sending the AUTOSEL emails, since clearly they go through no or very little human review.)
One of the challanges here is that it's difficult to solicit reviews or really any interaction from authors after a commit lands upstream. Look at the response rates to Greg's "FAILED" emails that ask authors to provide backports to commits they tagged for stable.
Of course, it's not just me that AUTOSEL isn't working for. So, you'll still continue backporting random commits that I have to spend hours bisecting, e.g. https://lore.kernel.org/stable/20220921155332.234913-7-sashal@kernel.org.
But at least I won't have to deal with this garbage for my own commits.
Now, I'm not sure I'll get a response to this --- I received no response to my last AUTOSEL question at https://lore.kernel.org/stable/Y1DTFiP12ws04eOM@sol.localdomain. So to hopefully entice you to actually do something, I'm also letting you know that I won't be reviewing any AUTOSEL mails for my commits anymore.
The really annoying thing is that someone even replied to your AUTOSEL email for that broken patch and told you it is broken (https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox....), and ***you ignored it and applied the patch anyway***.
Why are you even sending these emails if you are ignoring feedback anyway?
I obviously didn't ignore it on purpose, right?
On Mon, Feb 27, 2023 at 09:18:59AM -0500, Sasha Levin wrote:
On Sun, Feb 26, 2023 at 11:24:36AM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 09:30:37PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 08:07:55PM -0800, Eric Biggers wrote:
On Sat, Feb 25, 2023 at 10:42:47PM -0500, Sasha Levin wrote:
From: Eric Biggers ebiggers@google.com
[ Upstream commit ec64036e68634231f5891faa2b7a81cdc5dcd001 ]
Now that the key associated with the "test_dummy_operation" mount option is added on-demand when it's needed, rather than immediately when the filesystem is mounted, fscrypt_destroy_keyring() no longer needs to be called from __put_super() to avoid a memory leak on mount failure.
Remove this call, which was causing confusion because it appeared to be a sleep-in-atomic bug (though it wasn't, for a somewhat-subtle reason).
Signed-off-by: Eric Biggers ebiggers@google.com Link: https://lore.kernel.org/r/20230208062107.199831-5-ebiggers@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Why is this being backported?
- Eric
BTW, can you please permanently exclude all commits authored by me from AUTOSEL so that I don't have to repeatedly complain about every commit individually? Especially when these mails often come on weekends and holidays.
Yup, no problem - I'll ignore any commits authored by you.
I know how to use Cc stable, and how to ask explicitly for a stable backport if I find out after the fact that one is needed. (And other real people can always ask too... not counting AUTOSEL, even though you are sending the AUTOSEL emails, since clearly they go through no or very little human review.)
One of the challanges here is that it's difficult to solicit reviews or really any interaction from authors after a commit lands upstream. Look at the response rates to Greg's "FAILED" emails that ask authors to provide backports to commits they tagged for stable.
Well, it doesn't help that most of the stable emails aren't sent to the subsystem's mailing list, but instead just to the individual people mentioned in the commit. So many people who would like to help never know about it.
Of course, it's not just me that AUTOSEL isn't working for. So, you'll still continue backporting random commits that I have to spend hours bisecting, e.g. https://lore.kernel.org/stable/20220921155332.234913-7-sashal@kernel.org.
But at least I won't have to deal with this garbage for my own commits.
Now, I'm not sure I'll get a response to this --- I received no response to my last AUTOSEL question at https://lore.kernel.org/stable/Y1DTFiP12ws04eOM@sol.localdomain. So to hopefully entice you to actually do something, I'm also letting you know that I won't be reviewing any AUTOSEL mails for my commits anymore.
The really annoying thing is that someone even replied to your AUTOSEL email for that broken patch and told you it is broken (https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox....), and ***you ignored it and applied the patch anyway***.
Why are you even sending these emails if you are ignoring feedback anyway?
I obviously didn't ignore it on purpose, right?
I don't know, is it obvious? You've said in the past that sometimes you'd like to backport a commit even if the maintainer objects and/or it is known buggy. https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox.... also didn't explicitly say "Don't backport this" but instead "This patch has issues", so maybe that made a difference?
Anyway, the fact is that it happened. And if it happened in the one bug that I happened to look at because it personally affected me and I spent hours bisecting, it probably is happening in lots of other cases too. So it seems the process is not working...
Separately from responses to the AUTOSEL email, it also seems that you aren't checking for any reported regressions or pending fixes for a commit before backporting it. Simply searching lore for the commit title https://lore.kernel.org/all/?q=%22drm%2Famdgpu%3A+use+dirty+framebuffer+help... would have turned up the bug report https://lore.kernel.org/dri-devel/20220918120926.10322-1-user@am64/ that bisected a regression to that commit, as well as a patch that Fixes that commit: https://lore.kernel.org/all/20220920130832.2214101-1-alexander.deucher@amd.c... Both of these existed before you even sent the AUTOSEL email!
So to summarize, that buggy commit was backported even though:
* There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place. * On the AUTOSEL thread, someone told you the commit is broken. * There was already a thread that reported a regression caused by the commit. Easily findable via lore search. * There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
- Eric
On Mon, Feb 27, 2023 at 09:47:48AM -0800, Eric Biggers wrote:
Of course, it's not just me that AUTOSEL isn't working for. So, you'll still continue backporting random commits that I have to spend hours bisecting, e.g. https://lore.kernel.org/stable/20220921155332.234913-7-sashal@kernel.org.
But at least I won't have to deal with this garbage for my own commits.
Now, I'm not sure I'll get a response to this --- I received no response to my last AUTOSEL question at https://lore.kernel.org/stable/Y1DTFiP12ws04eOM@sol.localdomain. So to hopefully entice you to actually do something, I'm also letting you know that I won't be reviewing any AUTOSEL mails for my commits anymore.
The really annoying thing is that someone even replied to your AUTOSEL email for that broken patch and told you it is broken (https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox....), and ***you ignored it and applied the patch anyway***.
Why are you even sending these emails if you are ignoring feedback anyway?
I obviously didn't ignore it on purpose, right?
I don't know, is it obvious? You've said in the past that sometimes you'd like to backport a commit even if the maintainer objects and/or it is known buggy. https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox.... also didn't explicitly say "Don't backport this" but instead "This patch has issues", so maybe that made a difference?
Anyway, the fact is that it happened. And if it happened in the one bug that I happened to look at because it personally affected me and I spent hours bisecting, it probably is happening in lots of other cases too. So it seems the process is not working...
Separately from responses to the AUTOSEL email, it also seems that you aren't checking for any reported regressions or pending fixes for a commit before backporting it. Simply searching lore for the commit title https://lore.kernel.org/all/?q=%22drm%2Famdgpu%3A+use+dirty+framebuffer+help... would have turned up the bug report https://lore.kernel.org/dri-devel/20220918120926.10322-1-user@am64/ that bisected a regression to that commit, as well as a patch that Fixes that commit: https://lore.kernel.org/all/20220920130832.2214101-1-alexander.deucher@amd.c... Both of these existed before you even sent the AUTOSEL email!
So to summarize, that buggy commit was backported even though:
- There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place.
- On the AUTOSEL thread, someone told you the commit is broken.
- There was already a thread that reported a regression caused by the commit. Easily findable via lore search.
- There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
- Eric
On Mon, Feb 27, 2023 at 10:06:32AM -0800, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 09:47:48AM -0800, Eric Biggers wrote:
Of course, it's not just me that AUTOSEL isn't working for. So, you'll still continue backporting random commits that I have to spend hours bisecting, e.g. https://lore.kernel.org/stable/20220921155332.234913-7-sashal@kernel.org.
But at least I won't have to deal with this garbage for my own commits.
Now, I'm not sure I'll get a response to this --- I received no response to my last AUTOSEL question at https://lore.kernel.org/stable/Y1DTFiP12ws04eOM@sol.localdomain. So to hopefully entice you to actually do something, I'm also letting you know that I won't be reviewing any AUTOSEL mails for my commits anymore.
The really annoying thing is that someone even replied to your AUTOSEL email for that broken patch and told you it is broken (https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox....), and ***you ignored it and applied the patch anyway***.
Why are you even sending these emails if you are ignoring feedback anyway?
I obviously didn't ignore it on purpose, right?
I don't know, is it obvious? You've said in the past that sometimes you'd like to backport a commit even if the maintainer objects and/or it is known buggy. https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox.... also didn't explicitly say "Don't backport this" but instead "This patch has issues", so maybe that made a difference?
From what I gather I missed the reply - I would not blindly ignore a maintainer.
Anyway, the fact is that it happened. And if it happened in the one bug that I happened to look at because it personally affected me and I spent hours bisecting, it probably is happening in lots of other cases too. So it seems the process is not working...
This one is tricky, becuase we also end up taking a lot of commits that do fix real bugs, and were never tagged for stable or even had a fixes tag.
Maybe I should run the numbers again, but when we compared regression rates of stable tagged releases and AUTOSEL ones, it was fairly identical.
Separately from responses to the AUTOSEL email, it also seems that you aren't checking for any reported regressions or pending fixes for a commit before backporting it. Simply searching lore for the commit title https://lore.kernel.org/all/?q=%22drm%2Famdgpu%3A+use+dirty+framebuffer+help... would have turned up the bug report https://lore.kernel.org/dri-devel/20220918120926.10322-1-user@am64/ that bisected a regression to that commit, as well as a patch that Fixes that commit: https://lore.kernel.org/all/20220920130832.2214101-1-alexander.deucher@amd.c... Both of these existed before you even sent the AUTOSEL email!
I would love to have a way to automatically grep lore for reported issues that are pinpointed to a given commit. I'm hoping that Thorsten's regression tracker could be used that way soon enough.
So to summarize, that buggy commit was backported even though:
- There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place.
- On the AUTOSEL thread, someone told you the commit is broken.
- There was already a thread that reported a regression caused by the commit. Easily findable via lore search.
- There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
I would love to have a mechanism that tells me with 100% confidence if a given commit fixes a bug or not, could you provide me with one?
w.r.t timelines, this is something that was discussed on the mailing list a few years ago where we decided that giving AUTOSEL commits 7 days of soaking time is sufficient, if anything changed we can have this discussion again.
Note, however, that it's not enough to keep pointing at a tiny set and using it to suggest that the entire process is broken. How many AUTOSEL commits introduced a regression? How many -stable tagged ones did? How many bugs did AUTOSEL commits fix?
On Mon, Feb 27, 2023 at 03:39:14PM -0500, Sasha Levin wrote:
So to summarize, that buggy commit was backported even though:
- There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place.
- On the AUTOSEL thread, someone told you the commit is broken.
- There was already a thread that reported a regression caused by the commit. Easily findable via lore search.
- There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
I would love to have a mechanism that tells me with 100% confidence if a given commit fixes a bug or not, could you provide me with one?
Just because you can't be 100% certain whether a commit is a fix doesn't mean you should be rushing to backport random commits that have no indications they are fixing anything.
w.r.t timelines, this is something that was discussed on the mailing list a few years ago where we decided that giving AUTOSEL commits 7 days of soaking time is sufficient, if anything changed we can have this discussion again.
Nothing has changed, but that doesn't mean that your process is actually working. 7 days might be appropriate for something that looks like a security fix, but not for a random commit with no indications it is fixing anything.
BTW, based on that example it's not even 7 days between AUTOSEL and patch applied, but actually 7 days from AUTOSEL to *release*. So e.g. if someone takes just a 1 week vacation, in that time a commit they would have NAK'ed can be AUTOSEL'ed and pushed out across all LTS kernels...
Note, however, that it's not enough to keep pointing at a tiny set and using it to suggest that the entire process is broken. How many AUTOSEL commits introduced a regression? How many -stable tagged ones did? How many bugs did AUTOSEL commits fix?
So basically you don't accept feedback from individual people, as individual people don't have enough data?
- Eric
On Mon, Feb 27, 2023 at 09:38:46PM +0000, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 03:39:14PM -0500, Sasha Levin wrote:
So to summarize, that buggy commit was backported even though:
- There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place.
- On the AUTOSEL thread, someone told you the commit is broken.
- There was already a thread that reported a regression caused by the commit. Easily findable via lore search.
- There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
I would love to have a mechanism that tells me with 100% confidence if a given commit fixes a bug or not, could you provide me with one?
Just because you can't be 100% certain whether a commit is a fix doesn't mean you should be rushing to backport random commits that have no indications they are fixing anything.
The difference in opinion here is that I don't think it's rushing: the stable kernel rules say a commit must be in a released kernel, while the AUTOSEL timelines make it so a commit must have been in two released kernels.
Should we extend it? Maybe, I just don't think we have enough data to make a decision either way.
w.r.t timelines, this is something that was discussed on the mailing list a few years ago where we decided that giving AUTOSEL commits 7 days of soaking time is sufficient, if anything changed we can have this discussion again.
Nothing has changed, but that doesn't mean that your process is actually working. 7 days might be appropriate for something that looks like a security fix, but not for a random commit with no indications it is fixing anything.
How do we know if this is working or not though? How do you quantify the amount of useful commits?
How do you know if a certain fix has security implications? Or even if it actually fixes anything? For every "security" commit tagged for stable I could probably list a "security" commit with no tags whatsoever.
BTW, based on that example it's not even 7 days between AUTOSEL and patch applied, but actually 7 days from AUTOSEL to *release*. So e.g. if someone takes just a 1 week vacation, in that time a commit they would have NAK'ed can be AUTOSEL'ed and pushed out across all LTS kernels...
Right, and same as above: what's "enough"?
Note, however, that it's not enough to keep pointing at a tiny set and using it to suggest that the entire process is broken. How many AUTOSEL commits introduced a regression? How many -stable tagged ones did? How many bugs did AUTOSEL commits fix?
So basically you don't accept feedback from individual people, as individual people don't have enough data?
I'd love to improve the process, but for that we need to figure out criteria for what we consider good or bad, collect data, and make decisions based on that data.
What I'm getting from this thread is a few anecdotal examples and statements that the process isn't working at all.
I took Jon's stablefixes script which he used for his previous articles around stable kernel regressions (here: https://lwn.net/Articles/812231/) and tried running it on the 5.15 stable tree (just a random pick). I've proceeded with ignoring the non-user-visible regressions as Jon defined in his article (basically issues that were introduced and fixed in the same releases) and ended up with 604 commits that caused a user visible regression.
Out of those 604 commits:
- 170 had an explicit stable tag. - 434 did not have a stable tag.
Looking at the commits in the 5.15 tree:
With stable tag:
$ git log --oneline -i --grep "cc.*stable" v5.15..stable/linux-5.15.y | wc -l 3676
Without stable tag (-96 commits which are version bumps):
$ git log --oneline --invert-grep -i --grep "cc.*stable" v5.15..stable/linux-5.15.y | wc -l 10649
Regression rate for commits with stable tag: 170 / 3676 = 4.62% Regression rate for commits without a stable tag: 434 / 10553 = 4.11%
Is the analysis flawed somehow? Probably, and I'd happy take feedback on how/what I can do better, but this type of analysis is what I look for to know if the process is working well or not.
On Mon, Feb 27, 2023 at 05:35:30PM -0500, Sasha Levin wrote:
On Mon, Feb 27, 2023 at 09:38:46PM +0000, Eric Biggers wrote:
Just because you can't be 100% certain whether a commit is a fix doesn't mean you should be rushing to backport random commits that have no indications they are fixing anything.
The difference in opinion here is that I don't think it's rushing: the stable kernel rules say a commit must be in a released kernel, while the AUTOSEL timelines make it so a commit must have been in two released kernels.
Patches in -rc1 have been in _no_ released kernels. I'd feel a lot better about AUTOSEL if it didn't pick up changes until, say, -rc4, unless they were cc'd to stable.
Nothing has changed, but that doesn't mean that your process is actually working. 7 days might be appropriate for something that looks like a security fix, but not for a random commit with no indications it is fixing anything.
How do we know if this is working or not though? How do you quantify the amount of useful commits?
Sasha, 7 days is too short. People have to be allowed to take holiday.
I'd love to improve the process, but for that we need to figure out criteria for what we consider good or bad, collect data, and make decisions based on that data.
What I'm getting from this thread is a few anecdotal examples and statements that the process isn't working at all.
I took Jon's stablefixes script which he used for his previous articles around stable kernel regressions (here: https://lwn.net/Articles/812231/) and tried running it on the 5.15 stable tree (just a random pick). I've proceeded with ignoring the non-user-visible regressions as Jon defined in his article (basically issues that were introduced and fixed in the same releases) and ended up with 604 commits that caused a user visible regression.
Out of those 604 commits:
- 170 had an explicit stable tag.
- 434 did not have a stable tag.
I think a lot of people don't realise they have to _both_ put a Fixes tag _and_ add a Cc: stable. How many of those 604 commits had a Fixes tag?
On Mon, Feb 27, 2023 at 10:59:27PM +0000, Matthew Wilcox wrote:
On Mon, Feb 27, 2023 at 05:35:30PM -0500, Sasha Levin wrote:
On Mon, Feb 27, 2023 at 09:38:46PM +0000, Eric Biggers wrote:
Just because you can't be 100% certain whether a commit is a fix doesn't mean you should be rushing to backport random commits that have no indications they are fixing anything.
The difference in opinion here is that I don't think it's rushing: the stable kernel rules say a commit must be in a released kernel, while the AUTOSEL timelines make it so a commit must have been in two released kernels.
Patches in -rc1 have been in _no_ released kernels. I'd feel a lot better about AUTOSEL if it didn't pick up changes until, say, -rc4, unless they were cc'd to stable.
This happened before my time, but -rc are considered releases.
The counter point to your argument/ask is that if you run the numbers on regressions between -rc releases, it's the later one that tend to introduce (way) more issues.
I've actually written about it a few years back to ksummit discuss (here: https://lwn.net/Articles/753329/) because the numbers I saw indicate that later -rc releases are 3x likely to introduce a regression.
Linus pushed back on it saying that it is "by design" because those commits are way more complex than ones that land during the early -rc cycles.
So yes, I don't mind modifying the release workflow to decrease the regressions we introduce, but I think that there's a difference between what folks see as "helpful" and the outcome it would have.
Nothing has changed, but that doesn't mean that your process is actually working. 7 days might be appropriate for something that looks like a security fix, but not for a random commit with no indications it is fixing anything.
How do we know if this is working or not though? How do you quantify the amount of useful commits?
Sasha, 7 days is too short. People have to be allowed to take holiday.
That's true, and I don't have strong objections to making it longer. How often did it happen though? We don't end up getting too many replies past the 7 day window.
I'll bump it to 14 days for a few months and see if it changes anything.
I'd love to improve the process, but for that we need to figure out criteria for what we consider good or bad, collect data, and make decisions based on that data.
What I'm getting from this thread is a few anecdotal examples and statements that the process isn't working at all.
I took Jon's stablefixes script which he used for his previous articles around stable kernel regressions (here: https://lwn.net/Articles/812231/) and tried running it on the 5.15 stable tree (just a random pick). I've proceeded with ignoring the non-user-visible regressions as Jon defined in his article (basically issues that were introduced and fixed in the same releases) and ended up with 604 commits that caused a user visible regression.
Out of those 604 commits:
- 170 had an explicit stable tag.
- 434 did not have a stable tag.
I think a lot of people don't realise they have to _both_ put a Fixes tag _and_ add a Cc: stable. How many of those 604 commits had a Fixes tag?
What do you mean? Just a cc: stable tag is enough to land it in stable, you don't have to do both. The numbers above reflect that.
Running the numbers, there are 9422 commits with a Fixes tag in the 5.15 tree, out of which 360 had a regression, so 360 / 9422 = 3.82%.
On Mon, Feb 27, 2023 at 07:52:39PM -0500, Sasha Levin wrote:
Nothing has changed, but that doesn't mean that your process is actually working. 7 days might be appropriate for something that looks like a security fix, but not for a random commit with no indications it is fixing anything.
How do we know if this is working or not though? How do you quantify the amount of useful commits?
Sasha, 7 days is too short. People have to be allowed to take holiday.
That's true, and I don't have strong objections to making it longer. How often did it happen though? We don't end up getting too many replies past the 7 day window.
I'll bump it to 14 days for a few months and see if it changes anything.
It's not just for the review time, but also for the longer soak time in mainline.
Of course, for that to work properly you have to actually take advantage of it, for example by re-checking for fixes when actually applying the patch, not just when sending the initial AUTOSEL email. Which I hope you're doing already, but who knows.
- Eric
On Tue, Feb 28, 2023 at 01:25:57AM +0000, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 07:52:39PM -0500, Sasha Levin wrote:
Nothing has changed, but that doesn't mean that your process is actually working. 7 days might be appropriate for something that looks like a security fix, but not for a random commit with no indications it is fixing anything.
How do we know if this is working or not though? How do you quantify the amount of useful commits?
Sasha, 7 days is too short. People have to be allowed to take holiday.
That's true, and I don't have strong objections to making it longer. How often did it happen though? We don't end up getting too many replies past the 7 day window.
I'll bump it to 14 days for a few months and see if it changes anything.
It's not just for the review time, but also for the longer soak time in mainline.
There's a tradeoff to find. I'm sure there are way many more stable users than mainline users. Does this mean we have to break stable from time to time to detect regressions ? Sadly, yes. And it's not specific to Linux, it's the same for virtually any other project that supports maintenance branches. I personally like the principle of delaying backports to older branches so that users relying on much older branches know they're having a much more stable experience because the rare regressions that happen in faster moving branches have more time to be caught. But that requires incredibly more complex management.
On another project I'm sometimes seeing regressions caused by fixes pop up after 6 months of exposure in stable. And comparatively, regressions caused by new features tend to pop up faster, and users occasionally face such bugs in stable just because the backport got delayed. So there's no perfect balance, the problem is that any code change must be executed in field a few times to know if it's solid or not. The larger the exposure (i.e. stable) the faster regressions will be reported. The more frequent the code is triggered, the faster as well. Fixes for bugs that are very hard to trigger can cause regressions that will take ages to be reported. But nonetheless users want these fixes because most of the time they are correct.
When I was maintaining extended LTS kernels such as 2.6.32 or 3.10, users were mostly upgrading when they were waiting for a specific fix. And even there, despite patches having being present in regular stable kernels for months, we faced regressions. Sometimes a fix was not suitable for that branch, sometimes it was incorrectly backported, etc. What's certain however, is that the longer you wait for a backport, the more difficult it becomes to find someone who still remembers well about that fix and its specificities, even during the review. This really has to be taken into account when suggesting increased delays.
I really think that it's important to get backports early in recent stable branches. E.g. we could keep these 7 days till the last LTS branch, and let them cook one or two extra weeks before reaching older releases. But we wouldn't want to delay important fixes too much (which are still likely to cause regressions, especially security fixes which tend to focus on a specifically reported case). Maybe we could imagine that the Cc: stable could have a variant to mean "urgent" for important fixes that we want to bypass the wait time, and that by default other ones would flow a bit more slowly. This could satisfy most users by staying on the branch that brings them the update rate they need. But that would certainly be quite some extra work for Greg and for reviewers and that's certainly not cool, so we need to be reasonable here as well.
Just my two cents, Willy
Hi Sasha,
On Mon, Feb 27, 2023 at 07:52:39PM -0500, Sasha Levin wrote:
Sasha, 7 days is too short. People have to be allowed to take holiday.
That's true, and I don't have strong objections to making it longer. How often did it happen though? We don't end up getting too many replies past the 7 day window.
I'll bump it to 14 days for a few months and see if it changes anything.
I see that for recent AUTOSEL patches you're still using 7 days. In fact, it seems you may have even decreased it further to 5 days:
Sent Mar 14: https://lore.kernel.org/stable/20230314124435.471553-2-sashal@kernel.org Commited Mar 19: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/comm...
Any update on your plan to increase it to 14 days?
I hope you can understand why I have to ask you --- you are the only person who can change your own policy.
Thanks,
- Eric
On Thu, Mar 30, 2023 at 12:08:01AM +0000, Eric Biggers wrote:
Hi Sasha,
On Mon, Feb 27, 2023 at 07:52:39PM -0500, Sasha Levin wrote:
Sasha, 7 days is too short. People have to be allowed to take holiday.
That's true, and I don't have strong objections to making it longer. How often did it happen though? We don't end up getting too many replies past the 7 day window.
I'll bump it to 14 days for a few months and see if it changes anything.
I see that for recent AUTOSEL patches you're still using 7 days. In fact, it seems you may have even decreased it further to 5 days:
Sent Mar 14: https://lore.kernel.org/stable/20230314124435.471553-2-sashal@kernel.org Commited Mar 19: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/comm...
Any update on your plan to increase it to 14 days?
The commit you've pointed to was merged into Linus's tree on Feb 28th, and the first LTS tree that it was released in went out on March 22nd.
Quoting the concern you've raised around the process:
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
So now there's at least 14 days between mainline inclusion and a release in an LTS kernel, does that not conform with what you thought I'd be doing?
Most of that additional time comes in the form of me going over the tree and sending AUTOSEL mails a bit later, so I would be able to also pick follow-up fixes as they come in (and drop stuff that were reverted).
As a side note, inclusion into the stable-queue which you've pointed to above is a few steps before a release - it's mostly a cheap way for us to get mileage out of bots that run on the queue and address issues - it doesn't mean that the commit is released.
On Thu, Mar 30, 2023 at 10:05:39AM -0400, Sasha Levin wrote:
On Thu, Mar 30, 2023 at 12:08:01AM +0000, Eric Biggers wrote:
Hi Sasha,
On Mon, Feb 27, 2023 at 07:52:39PM -0500, Sasha Levin wrote:
Sasha, 7 days is too short. People have to be allowed to take holiday.
That's true, and I don't have strong objections to making it longer. How often did it happen though? We don't end up getting too many replies past the 7 day window.
I'll bump it to 14 days for a few months and see if it changes anything.
I see that for recent AUTOSEL patches you're still using 7 days. In fact, it seems you may have even decreased it further to 5 days:
Sent Mar 14: https://lore.kernel.org/stable/20230314124435.471553-2-sashal@kernel.org Commited Mar 19: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/comm...
Any update on your plan to increase it to 14 days?
The commit you've pointed to was merged into Linus's tree on Feb 28th, and the first LTS tree that it was released in went out on March 22nd.
Quoting the concern you've raised around the process:
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
So now there's at least 14 days between mainline inclusion and a release in an LTS kernel, does that not conform with what you thought I'd be doing?
Not quite. There are actually two different time periods:
1. The time from mainline to release 2. The time from AUTOSEL email to release
(1) is a superset of (2), but concerns were raised about *both* time periods being too short. Especially (1), but also (2) because reviewers can miss the 7-day review e.g. if they are on vacation for a week. Yes, they can of course miss non-AUTOSEL patches too, *if* they happen to get merged quickly enough (most kernel patches take several weeks just to get to mainline). But, AUTOSEL patches are known to be low quality submissions that really need that review.
I'm glad to hear that you've increased (1) to 14 days! However, that does not address (2). It also does not feel like much of a difference, since 12 days for (1) already seemed too short.
To be honest, I hesitate a bit to give you a precise suggestion, as it's liable to be used to push back on future suggestions as "this is what people agreed on before". (Just as you did in this thread, with saying 7 days had been agreed on before.) And it's not like there are any magic numbers -- we just know that the current periods seem to be too short. But, for a simple change, I think increasing (2) to 14 days would be reasonable, as that automatically gives 14 days for (1) too. If it isn't too much trouble to separate the periods, though, it would also be reasonable to choose something a bit higher for (1), like 18-21 days, and something a bit lower for (2), like 10-12 days.
- Eric
On Thu, Mar 30, 2023 at 10:22:10AM -0700, Eric Biggers wrote:
On Thu, Mar 30, 2023 at 10:05:39AM -0400, Sasha Levin wrote:
On Thu, Mar 30, 2023 at 12:08:01AM +0000, Eric Biggers wrote:
Hi Sasha,
On Mon, Feb 27, 2023 at 07:52:39PM -0500, Sasha Levin wrote:
Sasha, 7 days is too short. People have to be allowed to take holiday.
That's true, and I don't have strong objections to making it longer. How often did it happen though? We don't end up getting too many replies past the 7 day window.
I'll bump it to 14 days for a few months and see if it changes anything.
I see that for recent AUTOSEL patches you're still using 7 days. In fact, it seems you may have even decreased it further to 5 days:
Sent Mar 14: https://lore.kernel.org/stable/20230314124435.471553-2-sashal@kernel.org Commited Mar 19: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/comm...
Any update on your plan to increase it to 14 days?
The commit you've pointed to was merged into Linus's tree on Feb 28th, and the first LTS tree that it was released in went out on March 22nd.
Quoting the concern you've raised around the process:
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
So now there's at least 14 days between mainline inclusion and a release in an LTS kernel, does that not conform with what you thought I'd be doing?
Not quite. There are actually two different time periods:
- The time from mainline to release
- The time from AUTOSEL email to release
(1) is a superset of (2), but concerns were raised about *both* time periods being too short. Especially (1), but also (2) because reviewers can miss the 7-day review e.g. if they are on vacation for a week. Yes, they can of course miss non-AUTOSEL patches too, *if* they happen to get merged quickly enough (most kernel patches take several weeks just to get to mainline). But, AUTOSEL patches are known to be low quality submissions that really need that review.
I'm glad to hear that you've increased (1) to 14 days! However, that does not address (2). It also does not feel like much of a difference, since 12 days for (1) already seemed too short.
To be honest, I hesitate a bit to give you a precise suggestion, as it's liable to be used to push back on future suggestions as "this is what people agreed on before". (Just as you did in this thread, with saying 7 days had been agreed on before.) And it's not like there are any magic numbers -- we just know that the current periods seem to be too short. But, for a simple change, I think increasing (2) to 14 days would be reasonable, as that automatically gives 14 days for (1) too. If it isn't too much trouble to separate the periods, though, it would also be reasonable to choose something a bit higher for (1), like 18-21 days, and something a bit lower for (2), like 10-12 days.
No objection on my end, I can enforce 18+ days for (1) and 14+ days for (2).
I'd note that this isn't too far from what happened in the example in the previous mail:
(1) happened in 23 days. (2) happened in 9 days.
On Mon, Feb 27, 2023 at 05:35:30PM -0500, Sasha Levin wrote:
Note, however, that it's not enough to keep pointing at a tiny set and using it to suggest that the entire process is broken. How many AUTOSEL commits introduced a regression? How many -stable tagged ones did? How many bugs did AUTOSEL commits fix?
So basically you don't accept feedback from individual people, as individual people don't have enough data?
I'd love to improve the process, but for that we need to figure out criteria for what we consider good or bad, collect data, and make decisions based on that data.
What I'm getting from this thread is a few anecdotal examples and statements that the process isn't working at all.
I took Jon's stablefixes script which he used for his previous articles around stable kernel regressions (here: https://lwn.net/Articles/812231/) and tried running it on the 5.15 stable tree (just a random pick). I've proceeded with ignoring the non-user-visible regressions as Jon defined in his article (basically issues that were introduced and fixed in the same releases) and ended up with 604 commits that caused a user visible regression.
Out of those 604 commits:
- 170 had an explicit stable tag.
- 434 did not have a stable tag.
Looking at the commits in the 5.15 tree:
With stable tag:
$ git log --oneline -i --grep "cc.*stable" v5.15..stable/linux-5.15.y | wc -l 3676
Without stable tag (-96 commits which are version bumps):
$ git log --oneline --invert-grep -i --grep "cc.*stable" v5.15..stable/linux-5.15.y | wc -l 10649
Regression rate for commits with stable tag: 170 / 3676 = 4.62% Regression rate for commits without a stable tag: 434 / 10553 = 4.11%
Is the analysis flawed somehow? Probably, and I'd happy take feedback on how/what I can do better, but this type of analysis is what I look for to know if the process is working well or not.
I'm shocked that these are the statistics you use to claim the current AUTOSEL process is working. I think they actually show quite the opposite!
First, since many AUTOSEL commits aren't actually fixes but nearly all stable-tagged commits *are* fixes, the rate of regressions per commit would need to be lower for AUTOSEL commits than for stable-tagged commits in order for AUTOSEL commits to have the same rate of regressions *per fix*. Your numbers suggest a similar regression rate *per commit*. Thus, AUTOSEL probably introduces more regressions *per fix* than stable-tagged commits.
Second, the way you're identifying regression-introducing commits seems to be excluding one of the most common, maybe *the* most common, cause of AUTOSEL regressions: missing prerequisite commits. A very common case that I've seen repeatedly is AUTOSEL picking just patch 2 or higher of a multi-patch series. For an example, see the patch that started this thread... If a missing prerequisite is backported later, my understanding is that it usually isn't given a Fixes tag, as the upstream commit didn't have it. I think such regressions aren't counted in your statistic, which only looks at Fixes tags.
(Of course, stable-tagged commits sometimes have missing prerequisite bugs too. But it's expected to be at a lower rate, since the original developers and maintainers are directly involved in adding the stable tags. These are the people who are more familiar than anyone else with prerequisites.)
Third, the category "commits without a stable tag" doesn't include just AUTOSEL commits, but also non-AUTOSEL commits that people asked to be added to stable because they fixed a problem for them. Such commits often have been in mainline for a long time, so naturally they're expected to have a lower regression rate than stable-tagged commits due to the longer soak time, on average. So if the regression rate of stable-tagged and non-stable-tagged commits is actually similar, that suggests the regression rate of non-stable-tagged commits is being brought up artifically by a high regression rate in AUTOSEL commits...
So, I think your statistics actually reflect quite badly on AUTOSEL in its current form.
By the way, to be clear, AUTOSEL is absolutely needed. The way you are doing it currently is not working well, though. I think it needs to be tuned to select fewer, higher-confidence fixes, and you need to do some basic checks against each one, like "does this commit have a pending fix" and "is this commit part of a multi-patch series, and if so are earlier patches needed as prerequisites". There also needs to be more soak time in mainline, and more review time.
IMO you also need to take a hard look at whatever neural network thing you are using, as from what I've seen its results are quite poor... It does pick up some obvious fixes, but it seems they could have just as easily been found through some heuristics with grep. Beyond those obvious fixes, what it picks up seems to be barely distinguishable from a random selection.
- Eric
On Tue, Feb 28, 2023 at 12:32:20AM +0000, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 05:35:30PM -0500, Sasha Levin wrote:
Note, however, that it's not enough to keep pointing at a tiny set and using it to suggest that the entire process is broken. How many AUTOSEL commits introduced a regression? How many -stable tagged ones did? How many bugs did AUTOSEL commits fix?
So basically you don't accept feedback from individual people, as individual people don't have enough data?
I'd love to improve the process, but for that we need to figure out criteria for what we consider good or bad, collect data, and make decisions based on that data.
What I'm getting from this thread is a few anecdotal examples and statements that the process isn't working at all.
I took Jon's stablefixes script which he used for his previous articles around stable kernel regressions (here: https://lwn.net/Articles/812231/) and tried running it on the 5.15 stable tree (just a random pick). I've proceeded with ignoring the non-user-visible regressions as Jon defined in his article (basically issues that were introduced and fixed in the same releases) and ended up with 604 commits that caused a user visible regression.
Out of those 604 commits:
- 170 had an explicit stable tag.
- 434 did not have a stable tag.
Looking at the commits in the 5.15 tree:
With stable tag:
$ git log --oneline -i --grep "cc.*stable" v5.15..stable/linux-5.15.y | wc -l 3676
Without stable tag (-96 commits which are version bumps):
$ git log --oneline --invert-grep -i --grep "cc.*stable" v5.15..stable/linux-5.15.y | wc -l 10649
Regression rate for commits with stable tag: 170 / 3676 = 4.62% Regression rate for commits without a stable tag: 434 / 10553 = 4.11%
Is the analysis flawed somehow? Probably, and I'd happy take feedback on how/what I can do better, but this type of analysis is what I look for to know if the process is working well or not.
I'm shocked that these are the statistics you use to claim the current AUTOSEL process is working. I think they actually show quite the opposite!
First, since many AUTOSEL commits aren't actually fixes but nearly all stable-tagged commits *are* fixes, the rate of regressions per commit would need to be lower for AUTOSEL commits than for stable-tagged commits in order for AUTOSEL commits to have the same rate of regressions *per fix*. Your numbers suggest a similar regression rate *per commit*. Thus, AUTOSEL probably introduces more regressions *per fix* than stable-tagged commits.
Interesting claim. How many of the AUTOSEL commits are "actual" fixes? How do you know if a commit is a fix for anything or not?
Could you try and back claims with some evidence?
Yes, in a perfect world where we know if a commit is a fix we could avoid introducing regressions into the stable trees. Heck, maybe we could even stop writing buggy code to begin with?
Second, the way you're identifying regression-introducing commits seems to be excluding one of the most common, maybe *the* most common, cause of AUTOSEL regressions: missing prerequisite commits. A very common case that I've seen repeatedly is AUTOSEL picking just patch 2 or higher of a multi-patch series. For an example, see the patch that started this thread... If a missing prerequisite is backported later, my understanding is that it usually isn't given a Fixes tag, as the upstream commit didn't have it. I think such regressions aren't counted in your statistic, which only looks at Fixes tags.
It definitely happens, but we usually end up dropping the AUTOSEL-ed commit rather than bringing in complex dependency chains.
Look at the stable-queue for a record of those.
(Of course, stable-tagged commits sometimes have missing prerequisite bugs too. But it's expected to be at a lower rate, since the original developers and maintainers are directly involved in adding the stable tags. These are the people who are more familiar than anyone else with prerequisites.)
You'd be surprised. There is documentation around how one would annotate dependencies for stable tagged commits, something along the lines of:
cc: stable@kernel.org # dep1 dep2
Grep through the git log and see how often this is actually used.
Third, the category "commits without a stable tag" doesn't include just AUTOSEL commits, but also non-AUTOSEL commits that people asked to be added to stable because they fixed a problem for them. Such commits often have been in mainline for a long time, so naturally they're expected to have a lower regression rate than stable-tagged commits due to the longer soak time, on average. So if the regression rate of stable-tagged and non-stable-tagged commits is actually similar, that suggests the regression rate of non-stable-tagged commits is being brought up artifically by a high regression rate in AUTOSEL commits...
Yes, the numbers are pretty skewed up by different aspects of the process.
So, I think your statistics actually reflect quite badly on AUTOSEL in its current form.
By the way, to be clear, AUTOSEL is absolutely needed. The way you are doing it currently is not working well, though. I think it needs to be tuned to select fewer, higher-confidence fixes, and you need to do some basic checks against each one, like "does this commit have a pending fix" and "is this commit part of
Keep in mind that there's some lag time between when we do our thing vs when things appear upstream.
Greg actually runs the "is there a pending fix" check even after I've pushed the patches, before he cuts releases.
a multi-patch series, and if so are earlier patches needed as prerequisites". There also needs to be more soak time in mainline, and more review time.
Tricky bit with mainline/review time is that very few of our users actually run -rc trees.
We end up hitting many of the regressions because the commits actually end up in stable trees. Should it work that way? No, but our testing story around -rc releases is quite lacking.
IMO you also need to take a hard look at whatever neural network thing you are using, as from what I've seen its results are quite poor... It does pick up some obvious fixes, but it seems they could have just as easily been found through some heuristics with grep. Beyond those obvious fixes, what it picks up seems to be barely distinguishable from a random selection.
I mean... patches welcome? Do you want to come up with a set of heuristics that performs better and give it a go? I'll happily switch over.
I'm not sure how feedback in the form of "this sucks but I'm sure it could be much better" is useful.
On Mon, Feb 27, 2023 at 08:53:31PM -0500, Sasha Levin wrote:
I'm shocked that these are the statistics you use to claim the current AUTOSEL process is working. I think they actually show quite the opposite!
First, since many AUTOSEL commits aren't actually fixes but nearly all stable-tagged commits *are* fixes, the rate of regressions per commit would need to be lower for AUTOSEL commits than for stable-tagged commits in order for AUTOSEL commits to have the same rate of regressions *per fix*. Your numbers suggest a similar regression rate *per commit*. Thus, AUTOSEL probably introduces more regressions *per fix* than stable-tagged commits.
Interesting claim. How many of the AUTOSEL commits are "actual" fixes? How do you know if a commit is a fix for anything or not?
Could you try and back claims with some evidence?
Yes, in a perfect world where we know if a commit is a fix we could avoid introducing regressions into the stable trees. Heck, maybe we could even stop writing buggy code to begin with?
Are you seriously trying to claim that a random commit your neural network picked up is just as likely to be a fix as a commit that the author explicitly tagged as a fix and/or for stable?
That's quite an extraordinary claim, and it's not true from my experience. Lots of AUTOSEL patches that get Cc'ed to me, if I'm familiar enough with the area to understand fairly well whether the patch is a "fix", are not actually fixes. Or are very borderline "fixes" that don't meet stable criteria. (Note, I generally only bother responding to AUTOSEL if I think a patch is actually going to cause a problem. So a lack of response isn't necessarily agreement that a patch is really suitable for stable...)
Oh sorry, personal experience is not "evidence". Please disregard my invalid non-evidence-based opinion.
(Of course, stable-tagged commits sometimes have missing prerequisite bugs too. But it's expected to be at a lower rate, since the original developers and maintainers are directly involved in adding the stable tags. These are the people who are more familiar than anyone else with prerequisites.)
You'd be surprised. There is documentation around how one would annotate dependencies for stable tagged commits, something along the lines of:
cc: stable@kernel.org # dep1 dep2
Grep through the git log and see how often this is actually used.
Well, probably more common is that prerequisites are in the same patchset, and the prerequisites are tagged for stable too. Whereas AUTOSEL often just picks patch X of N. Also, developers and maintainers who tag patches for stable are probably more likely to help with the stable process in general and make sure patches are backported correctly...
Anyway, the point is, AUTOSEL needs to be fixed to stop inappropriately cherry-picking patch X of N so often.
a multi-patch series, and if so are earlier patches needed as prerequisites". There also needs to be more soak time in mainline, and more review time.
Tricky bit with mainline/review time is that very few of our users actually run -rc trees.
We end up hitting many of the regressions because the commits actually end up in stable trees. Should it work that way? No, but our testing story around -rc releases is quite lacking.
Well, in the bug that affected me, it *was* found on mainline almost immediately. It just took a bit longer than the extremely aggressive 7-day AUTOSEL period to be fixed.
Oh sorry again, one example is not "evidence". Please disregard my invalid non-evidence-based opinion.
I'm not sure how feedback in the form of "this sucks but I'm sure it could be much better" is useful.
I've already given you some specific suggestions.
I can't force you to listen to them, of course.
- Eric
I'm not sure how feedback in the form of "this sucks but I'm sure it could be much better" is useful.
I've already given you some specific suggestions.
I can't force you to listen to them, of course.
Eric,
As you probably know, this is not the first time that the subject of the AUTOSEL process has been discussed. Here is one example from fsdevel with a few other suggestions [1].
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
xfs maintainers have done that many years ago. This choice has consequences though - for years, no xfs fixes were flowing into stable trees at all, because no one was doing the backport work. It is hard to imagine that LTS kernel users were more happy about this situation than they would be from occasional regressions, but who knows...
It has taken a long time until we found the resources and finally started a process of reviewing, testing and submitting xfs fixes to stable trees and this process involves a lot of resources (3 maintainers + $$$), so opting out of AUTOSEL is not a clear win.
I will pencil down yet another discussion on fs and stable process at LSFMM23 to update on the current status with xfs, but it is hard to believe that this time we will be able to make significant changes to the AUTOSEL process.
Thanks, Amir.
[1] https://lore.kernel.org/linux-fsdevel/20201204160227.GA577125@mit.edu/#t
On Tue, Feb 28, 2023 at 12:41:07PM +0200, Amir Goldstein wrote:
I'm not sure how feedback in the form of "this sucks but I'm sure it could be much better" is useful.
I've already given you some specific suggestions.
I can't force you to listen to them, of course.
Eric,
As you probably know, this is not the first time that the subject of the AUTOSEL process has been discussed. Here is one example from fsdevel with a few other suggestions [1].
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
Yes, and simply put, that's the answer for any subsystem or maintainer that does not want their patches picked using the AUTOSEL tool.
The problem that the AUTOSEL tool is solving is real, we have whole major subsystems where no patches are ever marked as "for stable" and so real bugfixes are never backported properly.
In an ideal world, all maintainers would properly mark their patches for stable backporting (as documented for the past 15+ years, with a cc: stable tag, NOT a Fixes: tag), but we do not live in that world, and hence, the need for the AUTOSEL work.
thanks,
greg k-h
On 2/28/23 06:28, Greg KH wrote:
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
Yes, and simply put, that's the answer for any subsystem or maintainer that does not want their patches picked using the AUTOSEL tool.
The problem that the AUTOSEL tool is solving is real, we have whole major subsystems where no patches are ever marked as "for stable" and so real bugfixes are never backported properly.
Yeah, I agree.
And I'm throwing this out here [after having time to think about it due to an internet outage], but, would Cc'ing the patch's relevant subsystems on AUTOSEL emails help? This was sort of mentioned in this email[1] from Eric, and I think it _could_ help? I don't know, just something that crossed my mind earlier.
In an ideal world, all maintainers would properly mark their patches for stable backporting (as documented for the past 15+ years, with a cc: stable tag, NOT a Fixes: tag), but we do not live in that world, and hence, the need for the AUTOSEL work.
(I wish we did... Oh well.)
[1] https://lore.kernel.org/stable/Y%2Fzswi91axMN8OsA@sol.localdomain/
-- Slade
On Tue, Feb 28, 2023 at 09:05:16PM -0500, Slade Watkins wrote:
On 2/28/23 06:28, Greg KH wrote:
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
Yes, and simply put, that's the answer for any subsystem or maintainer that does not want their patches picked using the AUTOSEL tool.
The problem that the AUTOSEL tool is solving is real, we have whole major subsystems where no patches are ever marked as "for stable" and so real bugfixes are never backported properly.
Yeah, I agree.
And I'm throwing this out here [after having time to think about it due to an internet outage], but, would Cc'ing the patch's relevant subsystems on AUTOSEL emails help? This was sort of mentioned in this email[1] from Eric, and I think it _could_ help? I don't know, just something that crossed my mind earlier.
AFAICT, that's already being done now, which is good. What I was talking about is that the subsystem lists aren't included on the *other* stable emails. Most importantly, the "FAILED: patch failed to apply to stable tree" emails.
- Eric
On Tue, Feb 28, 2023 at 09:13:56PM -0800, Eric Biggers wrote:
On Tue, Feb 28, 2023 at 09:05:16PM -0500, Slade Watkins wrote:
On 2/28/23 06:28, Greg KH wrote:
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
Yes, and simply put, that's the answer for any subsystem or maintainer that does not want their patches picked using the AUTOSEL tool.
The problem that the AUTOSEL tool is solving is real, we have whole major subsystems where no patches are ever marked as "for stable" and so real bugfixes are never backported properly.
Yeah, I agree.
And I'm throwing this out here [after having time to think about it due to an internet outage], but, would Cc'ing the patch's relevant subsystems on AUTOSEL emails help? This was sort of mentioned in this email[1] from Eric, and I think it _could_ help? I don't know, just something that crossed my mind earlier.
AFAICT, that's already being done now, which is good. What I was talking about is that the subsystem lists aren't included on the *other* stable emails. Most importantly, the "FAILED: patch failed to apply to stable tree" emails.
Why would the FAILED emails want to go to a mailing list? If the people that were part of making the patch don't want to respond to a FAILED email, why would anyone on the mailing list?
But hey, I'll be glad to take a change to my script to add that functionality if you want to make it, it's here: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/tree...
thanks,
greg k-h
On Wed, Mar 01, 2023 at 07:09:49AM +0100, Greg KH wrote:
Why would the FAILED emails want to go to a mailing list? If the people that were part of making the patch don't want to respond to a FAILED email, why would anyone on the mailing list?
The same reason we use mailing lists for kernel development at all instead of just sending patches directly to the MAINTAINERS.
But hey, I'll be glad to take a change to my script to add that functionality if you want to make it, it's here: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/tree...
Ah, the classic "patches welcome".
Unfortunately, I don't think that can work very well for scripts that are only ever used by at most two people -- you and Sasha. Many even just by you, apparently, as I see your name, email address, home directory, etc. hardcoded in the scripts. (BTW, where are the scripts Sasha uses for AUTOSEL?)
- Eric
On Tue, Feb 28, 2023 at 11:22:53PM -0800, Eric Biggers wrote:
On Wed, Mar 01, 2023 at 07:09:49AM +0100, Greg KH wrote:
Why would the FAILED emails want to go to a mailing list? If the people that were part of making the patch don't want to respond to a FAILED email, why would anyone on the mailing list?
The same reason we use mailing lists for kernel development at all instead of just sending patches directly to the MAINTAINERS.
I'm personally seeing a difference between reviewing patches on a mailing list to help someone polish it, and commenting on its suitability for stable once it's got merged, from someone not having participated to its inclusion. All such patches are already sent to stable@ which many of those of us interested in having a look are already subscribed to, and which most often just triggers a quick glance depending on areas of interest. I hardly see anyone just curious about a patch ask "are you sure ?".
I could possibly understand the value of sending the failed ones to a list, in case someone with more time available than the author wants to give it a try. But again they're already sent to stable@. It's just that it might be possible that more interested parties are on other lists. But again I'm not fully convinced.
But hey, I'll be glad to take a change to my script to add that functionality if you want to make it, it's here: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/tree...
Ah, the classic "patches welcome".
Unfortunately, I don't think that can work very well for scripts that are only ever used by at most two people -- you and Sasha. Many even just by you, apparently, as I see your name, email address, home directory, etc. hardcoded in the scripts.
But it's going into a dead end. You are the one saying that changes are easy, suggesting to use get_maintainers.pl, so easy that you can't try to adapt them in existing stuff. Even without modifying existing scripts, if you are really interested by such features, why not at least try to run your idea over a whole series, figure how long it takes, how accurate it seems to be, adjust the output to remove unwanted noise and propose for review a few lines that seem to do the job for you ?
(BTW, where are the scripts Sasha uses for AUTOSEL?)
Maybe they're even uglier and he doesn't want to show them. The ones I was using to sort out 3.10 patches were totally ugly and full of heuristics, with lines that I would comment/uncomment along operations to refine the selection and I definitely wasn't going to post that anywhere. They were looking a bit like a commented extract of my .history. I'm sure over time Sasha has a much cleaner and repeatable process but maybe it involves parts that are not stabilized, that he's not proud of, or even tools contributed by coworkers that we don't necessarily need to know about and where he hardly sees how anyone could bring any help.
Willy
On Wed, Mar 01, 2023 at 08:40:03AM +0100, Willy Tarreau wrote:
But it's going into a dead end. You are the one saying that changes are easy, suggesting to use get_maintainers.pl, so easy that you can't try to adapt them in existing stuff. Even without modifying existing scripts, if you are really interested by such features, why not at least try to run your idea over a whole series, figure how long it takes, how accurate it seems to be, adjust the output to remove unwanted noise and propose for review a few lines that seem to do the job for you ?
As I said, Sasha *already does this for AUTOSEL*. So it seems this problem has already been solved, but Sasha and Greg are not coordinating with each other.
Anyway, it's hard to have much motivation to try to contribute to scripts that not only can I not use or test myself, but before even getting to that point, pretty much any ideas for improvements are strongly pushed back on. Earlier I was actually going to go into more detail about some ideas for how to flag and review potential problems with backporting a commit, but I figured why bother given how this thread has been going.
(Also I presume anything that would add *any* human time on the stable maintainers' end per patch, on average, would be strongly pushed back on too? But I have no visibility into what would be acceptable. I don't do their job.)
- Eric
On Wed, Mar 01, 2023 at 12:31:22AM -0800, Eric Biggers wrote:
On Wed, Mar 01, 2023 at 08:40:03AM +0100, Willy Tarreau wrote:
But it's going into a dead end. You are the one saying that changes are easy, suggesting to use get_maintainers.pl, so easy that you can't try to adapt them in existing stuff. Even without modifying existing scripts, if you are really interested by such features, why not at least try to run your idea over a whole series, figure how long it takes, how accurate it seems to be, adjust the output to remove unwanted noise and propose for review a few lines that seem to do the job for you ?
As I said, Sasha *already does this for AUTOSEL*. So it seems this problem has already been solved, but Sasha and Greg are not coordinating with each other.
We do not share the same scripts for these tasks as we have different roles here. That's all, nothing malicious.
thanks,
greg k-h
On Tue, Feb 28, 2023 at 09:05:16PM -0500, Slade Watkins wrote:
On 2/28/23 06:28, Greg KH wrote:
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
Yes, and simply put, that's the answer for any subsystem or maintainer that does not want their patches picked using the AUTOSEL tool.
The problem that the AUTOSEL tool is solving is real, we have whole major subsystems where no patches are ever marked as "for stable" and so real bugfixes are never backported properly.
Yeah, I agree.
And I'm throwing this out here [after having time to think about it due to an internet outage], but, would Cc'ing the patch's relevant subsystems on AUTOSEL emails help? This was sort of mentioned in this email[1] from Eric, and I think it _could_ help? I don't know, just something that crossed my mind earlier.
I don't know, maybe? Note that determining a patch's "subsystem" at many times is difficult in an automated fashion, have any idea how to do that reliably that doesn't just hit lkml all the time?
But again, how is that going to help much, the people who should be saying "no" are the ones on the signed-off-by and cc: lines in the patch itself.
thanks,
greg k-h
On Wed, Mar 01, 2023 at 07:06:26AM +0100, Greg KH wrote:
On Tue, Feb 28, 2023 at 09:05:16PM -0500, Slade Watkins wrote:
On 2/28/23 06:28, Greg KH wrote:
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
Yes, and simply put, that's the answer for any subsystem or maintainer that does not want their patches picked using the AUTOSEL tool.
The problem that the AUTOSEL tool is solving is real, we have whole major subsystems where no patches are ever marked as "for stable" and so real bugfixes are never backported properly.
Yeah, I agree.
And I'm throwing this out here [after having time to think about it due to an internet outage], but, would Cc'ing the patch's relevant subsystems on AUTOSEL emails help? This was sort of mentioned in this email[1] from Eric, and I think it _could_ help? I don't know, just something that crossed my mind earlier.
I don't know, maybe? Note that determining a patch's "subsystem" at many times is difficult in an automated fashion, have any idea how to do that reliably that doesn't just hit lkml all the time?
As I said, it seems Sasha already does this for AUTOSEL (but not other stable emails). I assume he uses either get_maintainer.pl, or the lists the original patch is sent to (retrievable from lore). This is *not* a hard problem.
But again, how is that going to help much, the people who should be saying "no" are the ones on the signed-off-by and cc: lines in the patch itself.
So that if one person does not respond, other people can help.
You're basically arguing that mailing lists shouldn't exist at all...
- Eric
On 28.02.23 12:28, Greg KH wrote:
On Tue, Feb 28, 2023 at 12:41:07PM +0200, Amir Goldstein wrote:
I'm not sure how feedback in the form of "this sucks but I'm sure it could be much better" is useful.
I've already given you some specific suggestions. I can't force you to listen to them, of course.
As you probably know, this is not the first time that the subject of the AUTOSEL process has been discussed. Here is one example from fsdevel with a few other suggestions [1].
But just so you know, as a maintainer, you have the option to request that patches to your subsystem will not be selected by AUTOSEL and run your own process to select, test and submit fixes to stable trees.
[...] In an ideal world, all maintainers would properly mark their patches for stable backporting (as documented for the past 15+ years, with a cc: stable tag, NOT a Fixes: tag), but we do not live in that world, and hence, the need for the AUTOSEL work.
Well, we could do something to get a bit closer to the ideal world: teach checkpatch.pl to help developers do the right thing in the first place. That's what I'm trying to do right now to make them add Link: tags more often (https://git.kernel.org/torvalds/c/d7f1d71e5ef6 ), as my regression tracking efforts heavily rely on them. Shouldn't be too hard to add a simple check along the lines of "this change has a Fixes: tag; either CC stable or do <foo> to suppress this warning" (<foo> could be a "nostable" tag or something else that we'd need to agree on first).
In an ideal we'd maybe even have a "checkpatch bot" that looks at all patches posted and sends feedback to the list if it finds something to improve. Sure, some (a lot?) of what AUTOSEL does relies on data that is only available after a change was merged, but maybe some is available earlier already.
Ciao, Thorsten
On Tue, Feb 28, 2023 at 12:28:23PM +0100, Greg KH wrote:
In an ideal world, all maintainers would properly mark their patches for stable backporting (as documented for the past 15+ years, with a cc: stable tag, NOT a Fixes: tag), but we do not live in that world, and hence, the need for the AUTOSEL work.
Just as a datapoint here: I stopped making any effort to copy stable on things due to AUTOSEL, it's pulling back so much more than I would consider for stable that it no longer seems worthwhile to try to make decisions about what might fit.
On Mon, Feb 27, 2023 at 07:41:31PM -0800, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 08:53:31PM -0500, Sasha Levin wrote:
I'm shocked that these are the statistics you use to claim the current AUTOSEL process is working. I think they actually show quite the opposite!
First, since many AUTOSEL commits aren't actually fixes but nearly all stable-tagged commits *are* fixes, the rate of regressions per commit would need to be lower for AUTOSEL commits than for stable-tagged commits in order for AUTOSEL commits to have the same rate of regressions *per fix*. Your numbers suggest a similar regression rate *per commit*. Thus, AUTOSEL probably introduces more regressions *per fix* than stable-tagged commits.
Interesting claim. How many of the AUTOSEL commits are "actual" fixes? How do you know if a commit is a fix for anything or not?
Could you try and back claims with some evidence?
Yes, in a perfect world where we know if a commit is a fix we could avoid introducing regressions into the stable trees. Heck, maybe we could even stop writing buggy code to begin with?
Are you seriously trying to claim that a random commit your neural network picked up is just as likely to be a fix as a commit that the author explicitly tagged as a fix and/or for stable?
I'd like to think that this is the case after the initial selection and multiple rounds of reviews, yes.
That's quite an extraordinary claim, and it's not true from my experience. Lots of AUTOSEL patches that get Cc'ed to me, if I'm familiar enough with the area to understand fairly well whether the patch is a "fix", are not actually fixes. Or are very borderline "fixes" that don't meet stable criteria. (Note, I generally only bother responding to AUTOSEL if I think a patch is actually going to cause a problem. So a lack of response isn't necessarily agreement that a patch is really suitable for stable...)
Oh sorry, personal experience is not "evidence". Please disregard my invalid non-evidence-based opinion.
(Of course, stable-tagged commits sometimes have missing prerequisite bugs too. But it's expected to be at a lower rate, since the original developers and maintainers are directly involved in adding the stable tags. These are the people who are more familiar than anyone else with prerequisites.)
You'd be surprised. There is documentation around how one would annotate dependencies for stable tagged commits, something along the lines of:
cc: stable@kernel.org # dep1 dep2
Grep through the git log and see how often this is actually used.
Well, probably more common is that prerequisites are in the same patchset, and the prerequisites are tagged for stable too. Whereas AUTOSEL often just picks patch X of N. Also, developers and maintainers who tag patches for stable are probably more likely to help with the stable process in general and make sure patches are backported correctly...
Anyway, the point is, AUTOSEL needs to be fixed to stop inappropriately cherry-picking patch X of N so often.
That's a fair point.
a multi-patch series, and if so are earlier patches needed as prerequisites". There also needs to be more soak time in mainline, and more review time.
Tricky bit with mainline/review time is that very few of our users actually run -rc trees.
We end up hitting many of the regressions because the commits actually end up in stable trees. Should it work that way? No, but our testing story around -rc releases is quite lacking.
Well, in the bug that affected me, it *was* found on mainline almost immediately. It just took a bit longer than the extremely aggressive 7-day AUTOSEL period to be fixed.
Oh sorry again, one example is not "evidence". Please disregard my invalid non-evidence-based opinion.
I'm happy that we're in agreement that significant process changes can't happen because of opinions or anecdotal examples.
In all seriousness, I will work on addressing the issues that happened around the commit(s) you've pointed out and improve our existing process.
On Mon, Feb 27, 2023 at 07:41:31PM -0800, Eric Biggers wrote:
Well, probably more common is that prerequisites are in the same patchset, and the prerequisites are tagged for stable too. Whereas AUTOSEL often just picks patch X of N. Also, developers and maintainers who tag patches for stable are probably more likely to help with the stable process in general and make sure patches are backported correctly...
Anyway, the point is, AUTOSEL needs to be fixed to stop inappropriately cherry-picking patch X of N so often.
... and AUTOSEL strikes again, with the 6.1 and 6.2 kernels currently crashing whenever a block device is removed, due to patches 1 and 3 of a 3-patch series being AUTOSEL'ed (on the same day I started this discussion, no less):
https://lore.kernel.org/linux-block/CAOCAAm4reGhz400DSVrh0BetYD3Ljr2CZen7_3D...
Oh sorry, ignore this, it's just an anecdotal example.
- Eric
On Fri, Mar 10, 2023 at 03:07:04PM -0800, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 07:41:31PM -0800, Eric Biggers wrote:
Well, probably more common is that prerequisites are in the same patchset, and the prerequisites are tagged for stable too. Whereas AUTOSEL often just picks patch X of N. Also, developers and maintainers who tag patches for stable are probably more likely to help with the stable process in general and make sure patches are backported correctly...
Anyway, the point is, AUTOSEL needs to be fixed to stop inappropriately cherry-picking patch X of N so often.
... and AUTOSEL strikes again, with the 6.1 and 6.2 kernels currently crashing whenever a block device is removed, due to patches 1 and 3 of a 3-patch series being AUTOSEL'ed (on the same day I started this discussion, no less):
https://lore.kernel.org/linux-block/CAOCAAm4reGhz400DSVrh0BetYD3Ljr2CZen7_3D...
Oh sorry, ignore this, it's just an anecdotal example.
Yes, clearly a problem with AUTOSEL and not with how sad the testing story is for stable releases.
On Sat, 2023-03-11 at 08:41 -0500, Sasha Levin wrote:
On Fri, Mar 10, 2023 at 03:07:04PM -0800, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 07:41:31PM -0800, Eric Biggers wrote:
Well, probably more common is that prerequisites are in the same patchset, and the prerequisites are tagged for stable too. Whereas AUTOSEL often just picks patch X of N. Also, developers and maintainers who tag patches for stable are probably more likely to help with the stable process in general and make sure patches are backported correctly...
Anyway, the point is, AUTOSEL needs to be fixed to stop inappropriately cherry-picking patch X of N so often.
... and AUTOSEL strikes again, with the 6.1 and 6.2 kernels currently crashing whenever a block device is removed, due to patches 1 and 3 of a 3-patch series being AUTOSEL'ed (on the same day I started this discussion, no less):
https://lore.kernel.org/linux-block/CAOCAAm4reGhz400DSVrh0BetYD3Ljr2CZen7_3D...
Oh sorry, ignore this, it's just an anecdotal example.
Yes, clearly a problem with AUTOSEL and not with how sad the testing story is for stable releases.
Hey, that's a completely circular argument: if we had perfect testing then, of course, it would pick up every bad patch before anything got released; but we don't, and everyone knows it. Therefore, we have to be discriminating about what patches we put in. And we have to acknowledge that zero bugs in patches isn't feasible in spite of all the checking we do do. I also think we have to acknowledge that users play a role in the testing process because some bugs simply aren't picked up until they try out a release. So discouraging users from running mainline -rc's means we do get bugs in the released kernel that we might not have had if they did. Likewise if everyone only runs stable kernels, the bugs in the released kernel don't get found until stable. So this blame game really isn't helping.
I think the one thing everyone on this thread might agree on is that this bug wouldn't have happened if AUTOSEL could detect and backport series instead of individual patches. Sasha says that can't be done based on in information in Linus' tree[1] which is true but not a correct statement of the problem. The correct question is given all the information available, including lore, could we assist AUTOSEL in better detecting series and possibly making better decisions generally?
I think that's the challenge for anyone who actually wants to help rather than complain. At least the series detection bit sounds like it could be a reasonable summer of code project.
Regards,
James
[1] https://lore.kernel.org/linux-fsdevel/ZAyK0KM6JmVOvQWy@sashalap/
On Sat, Mar 11, 2023 at 10:54:36AM -0500, James Bottomley wrote:
On Sat, 2023-03-11 at 08:41 -0500, Sasha Levin wrote:
On Fri, Mar 10, 2023 at 03:07:04PM -0800, Eric Biggers wrote:
On Mon, Feb 27, 2023 at 07:41:31PM -0800, Eric Biggers wrote:
Well, probably more common is that prerequisites are in the same patchset, and the prerequisites are tagged for stable too. Whereas AUTOSEL often just picks patch X of N. Also, developers and maintainers who tag patches for stable are probably more likely to help with the stable process in general and make sure patches are backported correctly...
Anyway, the point is, AUTOSEL needs to be fixed to stop inappropriately cherry-picking patch X of N so often.
... and AUTOSEL strikes again, with the 6.1 and 6.2 kernels currently crashing whenever a block device is removed, due to patches 1 and 3 of a 3-patch series being AUTOSEL'ed (on the same day I started this discussion, no less):
https://lore.kernel.org/linux-block/CAOCAAm4reGhz400DSVrh0BetYD3Ljr2CZen7_3D...
Oh sorry, ignore this, it's just an anecdotal example.
Yes, clearly a problem with AUTOSEL and not with how sad the testing story is for stable releases.
Hey, that's a completely circular argument: if we had perfect testing then, of course, it would pick up every bad patch before anything got released; but we don't, and everyone knows it. Therefore, we have to be discriminating about what patches we put in. And we have to acknowledge that zero bugs in patches isn't feasible in spite of all the checking we do do. I also think we have to acknowledge that users play a role in the testing process because some bugs simply aren't picked up until they try out a release. So discouraging users from running mainline -rc's means we do get bugs in the released kernel that we might not have had if they did. Likewise if everyone only runs stable kernels, the bugs in the released kernel don't get found until stable. So this blame game really isn't helping.
I think the one thing everyone on this thread might agree on is that this bug wouldn't have happened if AUTOSEL could detect and backport series instead of individual patches. Sasha says that can't be done based on in information in Linus' tree[1] which is true but not a correct statement of the problem. The correct question is given all the information available, including lore, could we assist AUTOSEL in better detecting series and possibly making better decisions generally?
My argument was that this type of issue is no AUTOSEL specific, and we saw it happening multiple times with stable tagged patches as well.
It's something that needs to get solved, and I suspect that both Greg and myself will end up using it when it's there.
I think that's the challenge for anyone who actually wants to help rather than complain. At least the series detection bit sounds like it could be a reasonable summer of code project.
Right - I was trying to reply directly to Willy's question: this is something very useful, somewhat hard, and I don't think I could do in the near future - so help is welcome here.
On Sat, Mar 11, 2023 at 01:07:09PM -0500, Sasha Levin wrote:
I think the one thing everyone on this thread might agree on is that this bug wouldn't have happened if AUTOSEL could detect and backport series instead of individual patches. Sasha says that can't be done based on in information in Linus' tree[1] which is true but not a correct statement of the problem. The correct question is given all the information available, including lore, could we assist AUTOSEL in better detecting series and possibly making better decisions generally?
My argument was that this type of issue is no AUTOSEL specific, and we saw it happening multiple times with stable tagged patches as well.
I suspect that it happens *less* with Greg's patches, since the people who add (and it's almost always remove) the Cc: tags have the dependency information fresh in their brains' caches, and are more likely to correctly tag which patches should and shouldn't have Cc: stable tags.
Now, if after I've carefully annotated a patch series, some with Cc: stable tags, and some without, and AUTOSEL were to override my work and take some extra patches, that I had deliberately not tagged with Cc: stable, I can (and have) gotten mildly annoyed, but in general, it means that something which probably shouldn't and didn't need to go into an LTS kernel ended up going into an LTS kernel, and TBH, when this has happened, I don't worry about it *too* much. It probably has made LTS less sstable when this happens, but it's generally not a disaster.
In any case, I think the problem of missing dependencies when they are in a patch series is *primarily* an AUTOSEL problem, and as I said, probably can be fixed by using the information in lore.
I'll note that we can probably make the "was this part of the patch series" work even better by encouraging contributors not to take unrelated patches and put them unnecessarily into a patch series. So if I knew that the stable bots were taking patch series into account, or were about to start taking advantage of this signal, I'd be happy to push back on contributors to break up patch series that should be glommed together.
- Ted
Hi!
So to summarize, that buggy commit was backported even though:
- There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place.
- On the AUTOSEL thread, someone told you the commit is broken.
- There was already a thread that reported a regression caused by the commit. Easily findable via lore search.
- There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
I see this problem, too, "-stable" is more experimental than Linus's releases.
I believe that -stable would be more useful without AUTOSEL process.
Best regards, Pavel
On Tue, Mar 07, 2023 at 10:18:35PM +0100, Pavel Machek wrote:
Hi!
So to summarize, that buggy commit was backported even though:
- There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place.
- On the AUTOSEL thread, someone told you the commit is broken.
- There was already a thread that reported a regression caused by the commit. Easily findable via lore search.
- There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
BTW, another cause of this is that the commit (66f99628eb24) was AUTOSEL'd after only being in mainline for 4 days, and *released* in all LTS kernels after only being in mainline for 12 days. Surely that's a timeline befitting a critical security vulnerability, not some random neural-network-selected commit that wasn't even fixing anything?
I see this problem, too, "-stable" is more experimental than Linus's releases.
I believe that -stable would be more useful without AUTOSEL process.
There has to be a way to ensure that security fixes that weren't properly tagged make it to stable anyway. So, AUTOSEL is necessary, at least in some form. I think that debating *whether it should exist* is a distraction from what's actually important, which is that the current AUTOSEL process has some specific problems, and these specific problems need to be fixed...
- Eric
On Tue, Mar 07, 2023 at 09:45:24PM +0000, Eric Biggers wrote:
On Tue, Mar 07, 2023 at 10:18:35PM +0100, Pavel Machek wrote:
I believe that -stable would be more useful without AUTOSEL process.
There has to be a way to ensure that security fixes that weren't properly tagged make it to stable anyway. So, AUTOSEL is necessary, at least in some form. I think that debating *whether it should exist* is a distraction from what's actually important, which is that the current AUTOSEL process has some specific problems, and these specific problems need to be fixed...
I agree with you, that we need autosel and we also need autosel to be better. I actually see Pavel's mail as a datapoint (or "anecdote", if you will) in support of that; the autosel process currently works so badly that a long-time contributor thinks it's worse than nothing.
Sasha, what do you need to help you make this better?
On Sat, Mar 11, 2023 at 06:25:59AM +0000, Matthew Wilcox wrote:
On Tue, Mar 07, 2023 at 09:45:24PM +0000, Eric Biggers wrote:
On Tue, Mar 07, 2023 at 10:18:35PM +0100, Pavel Machek wrote:
I believe that -stable would be more useful without AUTOSEL process.
There has to be a way to ensure that security fixes that weren't properly tagged make it to stable anyway. So, AUTOSEL is necessary, at least in some form. I think that debating *whether it should exist* is a distraction from what's actually important, which is that the current AUTOSEL process has some specific problems, and these specific problems need to be fixed...
I agree with you, that we need autosel and we also need autosel to be better. I actually see Pavel's mail as a datapoint (or "anecdote", if you will) in support of that; the autosel process currently works so badly that a long-time contributor thinks it's worse than nothing.
Sasha, what do you need to help you make this better?
One would probably need to define "better" and "so badly". As a user of -stable kernels, I consider that they've got much better over the last years. A lot of processes have improved everywhere even before the release, but I do think that autosel is part of what generally gives a chance to some useful and desired fixed (e.g. in drivers) to be backported and save some users unneeded headaches.
In fact I think that the reason for the negative perception is that patches that it picks are visible, and it's easy to think "WTF" when seeing one of them. Previously, these patches were not proposed, so nobody knew they were missing. It happened to plenty of us to spend some time trying to spot why a stable kernel would occasionally fail on a machine, and discovering in the process that mainline did work because it contained a fix that was never backported. This is frustrating but there's noone to blame for failing to pick that patch (and the patch's author should not be blamed either since for small compatibility stuff it's probably common to see first-timers who are not yet at ease with the process).
Here the patches are CCed to their authors before being merged. They get a chance to be reviewed and rejected. Granted, maybe sometimes they could be subject to a longer delay or be sent to certain lists. Maybe. But I do think that the complaints in fact reflect a process that's not as broken as some think, precisely because it allows people to complain when something is going wrong. The previous process didn't permit that. For this alone it's a progress.
Willy
Hi!
I believe that -stable would be more useful without AUTOSEL process.
There has to be a way to ensure that security fixes that weren't properly tagged make it to stable anyway. So, AUTOSEL is necessary, at least in some form. I think that debating *whether it should exist* is a distraction from what's actually important, which is that the current AUTOSEL process has some specific problems, and these specific problems need to be fixed...
I agree with you, that we need autosel and we also need autosel to be better. I actually see Pavel's mail as a datapoint (or "anecdote", if you will) in support of that; the autosel process currently works so badly that a long-time contributor thinks it's worse than nothing.
Sasha, what do you need to help you make this better?
One would probably need to define "better" and "so badly". As a user of -stable kernels, I consider that they've got much better over the
Well, we have Documentation/process/stable-kernel-rules.rst . If we wanted to define "better", we should start documenting what the real rules are for the patches in the stable tree.
I agree that -stable works quite well, but the real rules are far away from what is documented.
I don't think AUTOSEL works well. I believe we should require positive reply from patch author on relevant maintainer before merging such patch to -stable.
Best regards, Pavel
On Sat, Mar 11, 2023 at 12:45:20PM +0100, Pavel Machek wrote:
Hi!
I believe that -stable would be more useful without AUTOSEL process.
There has to be a way to ensure that security fixes that weren't properly tagged make it to stable anyway. So, AUTOSEL is necessary, at least in some form. I think that debating *whether it should exist* is a distraction from what's actually important, which is that the current AUTOSEL process has some specific problems, and these specific problems need to be fixed...
I agree with you, that we need autosel and we also need autosel to be better. I actually see Pavel's mail as a datapoint (or "anecdote", if you will) in support of that; the autosel process currently works so badly that a long-time contributor thinks it's worse than nothing.
Sasha, what do you need to help you make this better?
One would probably need to define "better" and "so badly". As a user of -stable kernels, I consider that they've got much better over the
Well, we have Documentation/process/stable-kernel-rules.rst . If we wanted to define "better", we should start documenting what the real rules are for the patches in the stable tree.
I agree that -stable works quite well, but the real rules are far away from what is documented.
I don't think AUTOSEL works well. I believe we should require positive reply from patch author on relevant maintainer before merging such patch to -stable.
Again, for the people in the back so that everyone can hear it, that does not work as some subsystems refuse to tag ANY patches for stable commits, nor do they want to have anything to do with stable kernels at all. And that's fine, that's their option, but because of that, we have to have a way to actually get the real fixes in those subsystems to the users who use these stable kernels. Hence, the AUTOSEL work.
So no, forcing a maintainer/author to ack a patch to get it into stable is not going to work UNLESS a maintainer/author explicitly asks for that, which some have, and that's wonderful.
thanks,
greg k-h
On Sat, 11 Mar 2023, Greg KH wrote:
So no, forcing a maintainer/author to ack a patch to get it into stable is not going to work UNLESS a maintainer/author explicitly asks for that, which some have, and that's wonderful.
FWIW I'm happy to ack patches for stable backporting (and I tag patches of mine for stable as I deem appropriate).
Maciej
On Sat, Mar 11, 2023 at 06:25:59AM +0000, Matthew Wilcox wrote:
On Tue, Mar 07, 2023 at 09:45:24PM +0000, Eric Biggers wrote:
On Tue, Mar 07, 2023 at 10:18:35PM +0100, Pavel Machek wrote:
I believe that -stable would be more useful without AUTOSEL process.
There has to be a way to ensure that security fixes that weren't properly tagged make it to stable anyway. So, AUTOSEL is necessary, at least in some form. I think that debating *whether it should exist* is a distraction from what's actually important, which is that the current AUTOSEL process has some specific problems, and these specific problems need to be fixed...
I agree with you, that we need autosel and we also need autosel to be better. I actually see Pavel's mail as a datapoint (or "anecdote", if you will) in support of that; the autosel process currently works so badly that a long-time contributor thinks it's worse than nothing.
Sasha, what do you need to help you make this better?
What could I do to avoid this?
I suppose that if I had a way to know if a certain a commit is part of a series, I could either take all of it or none of it, but I don't think I have a way of doing that by looking at a commit in Linus' tree (suggestions welcome, I'm happy to implement them).
Other than that, the commit at hand:
1. Describes a real problem that needs to be fixed, so while it was reverted for a quick fix, we'll need to go back and bring it in along with it's dependency.
2. Soaked for over two weeks between the AUTOSEL mails and the release, gone through multiple rounds of reviews.
3. Went through all the tests provided by all the individuals, bots, companies, etc who test the tree through multiple rounds of testing (we had to do a -rc2 for that releases).
4. Went through whatever tests distros run on the kernel before they package and release it.
On Sat, Mar 11, 2023 at 09:06:08AM -0500, Sasha Levin wrote:
I suppose that if I had a way to know if a certain a commit is part of a series, I could either take all of it or none of it, but I don't think I have a way of doing that by looking at a commit in Linus' tree (suggestions welcome, I'm happy to implement them).
Well, this is why I think it is a good idea to have a link to the patch series in lore. I know Linus doesn't like it, claiming it doesn't add any value, but I have to disagree. It adds two bits of value.
First, if there is any discussion on the review of the patch before it goes in, the lore link gives you access to that --- and if people have a back-lick in the cover letter of version N to the cover letter of version N-1, it allows someone doing code archeology to find all of the discussions around the patch series in the lore archives.
Secondly, the lore link will allow you to figure out whether or not the patch is part of a series; b4 can figure this out by looking at the in-reply-to headers, and lore will chain the patch series together, so if the commit contains a lore link to the patch, the AUTOSEL script could use that to find out whether the patch is part of the series.
And this is really easy to do. All you need is the following in .git/hooks/applypatch-msg:
#!/bin/sh # For .git/hooks/applypatch-msg # . git-sh-setup perl -pi -e 's|^Message-Id:\s*<?([^>]+)>?$|Link: https://lore.kernel.org/r/%241%7Cg;' "$1" test -x "$GIT_DIR/hooks/commit-msg" && exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"} :
Cheers,
- Ted
P.S. There was a recent patch series where I noticed that I would be screwed if AUTOSEL would only take patch 2/2 and not patch 1/2. I dealt with that though by adding an explicit "Cc: stable@kernel.org". So that's the other way to avoid breakage; if people were universally careful about adding "Cc: stable@kernel.org" tags, then we wouldn't need AUTOSEL at all.
And this is another place where I break with commonly received wisdom about "Thou Shalt Never, Never Rewind The Git Branch". Personally, if I find that I missed a Cc: stable tag, rewinding the branch to add edit the trailers is *far* better a tradeoff than adhering to some religious rule about never rewinding git branches. Of course, I can get away with that since I don't have people basing their branches on my branch. But I've seen people who will self-righteously proclaim non-rewinding git branches as the One True Way to do git, and I profoundly disagree with that point of view.
On Sat, Mar 11, 2023 at 11:16:44AM -0500, Theodore Ts'o wrote:
On Sat, Mar 11, 2023 at 09:06:08AM -0500, Sasha Levin wrote:
I suppose that if I had a way to know if a certain a commit is part of a series, I could either take all of it or none of it, but I don't think I have a way of doing that by looking at a commit in Linus' tree (suggestions welcome, I'm happy to implement them).
Well, this is why I think it is a good idea to have a link to the patch series in lore. I know Linus doesn't like it, claiming it doesn't add any value, but I have to disagree. It adds two bits of value.
So, earlier I was going to go into more detail about some of my ideas, before Sasha and Greg started stonewalling with "patches welcome" (i.e. "I'm refusing to do my job") and various silly arguments about why nothing should be changed. But I suppose the worst thing that can happen is that that just continues, so here it goes:
One of the first things I would do if I was maintaining the stable kernels is to set up a way to automatically run searches on the mailing lists, and then take advantage of that in the stable process in various ways. Not having that is the root cause of a lot of the issues with the current process, IMO.
Now that lore exists, this might be trivial: it could be done just by hammering lore.kernel.org with queries https://lore.kernel.org/linux-fsdevel/?q=query from a Python script.
Of course, there's a chance that won't scale to multiple queries for each one of thousands of stable commits, or at least won't be friendly to the kernel.org admins. In that case, what can be done is to download down all emails from all lists, using lore's git mirrors or Atom feeds, and index them locally. (Note: if the complete history is inconveniently large, then just indexing the last year or so would work nearly as well.)
Then once that is in place, that could be used in various ways. For example, given a git commit, it's possible to search by email subject to get to the original patch, *even if the git commit does not have a Link tag*. And it can be automatically checked whether it's part of a patch series, and if so, whether all the patches in the series are being backported or just some.
This could also be used to check for mentions of a commit on the mailing list that potentially indicate a regression report, which is one of the issues we discussed earlier. I'm not sure what the optimal search criteria would be, but one option would be something like "messages that contain the commit title or commit ID and are dated to after the commit being committed". There might need to be some exclusions added to that.
This could also be used to automatically find the AUTOSEL email, if one exists, and check whether it's been replied to or not.
The purpose of all these mailing list searches would be to generate a list of potential issues with backporting each commit, which would then undergo brief human review. Once issues are reviewed, that state would be persisted, so that if the script gets run again, it would only show *new* information based on new mailing list emails that have not already been reviewed. That's needed because these issues need to be checked for when the patch is initially proposed for stable as well as slightly later, before the actual release happens.
If the stable maintainers have no time for doing *any* human review themselves (again, I do not know what their requirements are on how much time they can spend per patch), then instead an email with the list of potential issues could be generated and sent to stable@vger.kernel.org for review by others.
Anyway, that's my idea. I know the response will be either "that won't work" or "patches welcome", or a mix of both, but that's it.
- Eric
On Sat, Mar 11, 2023 at 09:48:13AM -0800, Eric Biggers wrote:
On Sat, Mar 11, 2023 at 11:16:44AM -0500, Theodore Ts'o wrote:
On Sat, Mar 11, 2023 at 09:06:08AM -0500, Sasha Levin wrote:
I suppose that if I had a way to know if a certain a commit is part of a series, I could either take all of it or none of it, but I don't think I have a way of doing that by looking at a commit in Linus' tree (suggestions welcome, I'm happy to implement them).
Well, this is why I think it is a good idea to have a link to the patch series in lore. I know Linus doesn't like it, claiming it doesn't add any value, but I have to disagree. It adds two bits of value.
So, earlier I was going to go into more detail about some of my ideas, before Sasha and Greg started stonewalling with "patches welcome" (i.e. "I'm refusing to do my job") and various silly arguments about why nothing should be changed. But I suppose the worst thing that can happen is that that just continues, so here it goes:
"job"? do you think I'm paid to do this work? Why would I stonewall improvements to the process?
I'm getting a bunch of suggestions and complaints that I'm not implementing those suggestions fast enough on my spare time.
One of the first things I would do if I was maintaining the stable kernels is to set up a way to automatically run searches on the mailing lists, and then take advantage of that in the stable process in various ways. Not having that is the root cause of a lot of the issues with the current process, IMO.
"if I was maintaining the stable kernels" - why is this rellevant? give us the tool you've proposed below and we'll be happy to use it. Heck, don't give it to us, use it to review the patches we're sending out for review and let us know if we've missed anything.
Now that lore exists, this might be trivial: it could be done just by hammering lore.kernel.org with queries https://lore.kernel.org/linux-fsdevel/?q=query from a Python script.
Of course, there's a chance that won't scale to multiple queries for each one of thousands of stable commits, or at least won't be friendly to the kernel.org admins. In that case, what can be done is to download down all emails from all lists, using lore's git mirrors or Atom feeds, and index them locally. (Note: if the complete history is inconveniently large, then just indexing the last year or so would work nearly as well.)
Then once that is in place, that could be used in various ways. For example, given a git commit, it's possible to search by email subject to get to the original patch, *even if the git commit does not have a Link tag*. And it can be automatically checked whether it's part of a patch series, and if so, whether all the patches in the series are being backported or just some.
This could also be used to check for mentions of a commit on the mailing list that potentially indicate a regression report, which is one of the issues we discussed earlier. I'm not sure what the optimal search criteria would be, but one option would be something like "messages that contain the commit title or commit ID and are dated to after the commit being committed". There might need to be some exclusions added to that.
This could also be used to automatically find the AUTOSEL email, if one exists, and check whether it's been replied to or not.
The purpose of all these mailing list searches would be to generate a list of potential issues with backporting each commit, which would then undergo brief human review. Once issues are reviewed, that state would be persisted, so that if the script gets run again, it would only show *new* information based on new mailing list emails that have not already been reviewed. That's needed because these issues need to be checked for when the patch is initially proposed for stable as well as slightly later, before the actual release happens.
If the stable maintainers have no time for doing *any* human review themselves (again, I do not know what their requirements are on how much time they can spend per patch), then instead an email with the list of potential issues could be generated and sent to stable@vger.kernel.org for review by others.
Anyway, that's my idea. I know the response will be either "that won't work" or "patches welcome", or a mix of both, but that's it.
I've been playing with this in the past - I had a bot that looks at the mailing lists for patches that are tagged for stable, and attempts to apply/build then on the multiple trees to verify that it works and send a reply back if something goes wrong, asking for a backport.
It gets a bit tricky as there's no way to go back from a commit to the initial submission, you start hitting issues like:
- Patches get re-sent multiple times (think stuff like tip trees, reviews from other maintainers, etc). - Different versions of patches - for example, v1 was a single patch and in v2 it became multiple patches.
I'm not arguing against your idea, I'm just saying that it's not trivial. An incomplete work here simply won't scale to the thousands of patches that flow in the trees, and won't be as useful. I don't think that this is trivial as you suggest.
If you disagree, and really think it's trivial, take 5 minutes to write something up? please?
On Sat, Mar 11, 2023 at 01:26:57PM -0500, Sasha Levin wrote:
"job"? do you think I'm paid to do this work?
Why would I stonewall improvements to the process?
I'm getting a bunch of suggestions and complaints that I'm not implementing those suggestions fast enough on my spare time.
One of the first things I would do if I was maintaining the stable kernels is to set up a way to automatically run searches on the mailing lists, and then take advantage of that in the stable process in various ways. Not having that is the root cause of a lot of the issues with the current process, IMO.
"if I was maintaining the stable kernels" - why is this rellevant? give us the tool you've proposed below and we'll be happy to use it. Heck, don't give it to us, use it to review the patches we're sending out for review and let us know if we've missed anything.
It's kind of a stretch to claim that maintaining the stable kernels is not part of your and Greg's jobs. But anyway, the real problem is that it's currently very hard for others to contribute, given the unique role the stable maintainers have and the lack of documentation about it. Each of the two maintainers has their own scripts, and it is not clear how they use them and what processes they follow. (Even just stable-kernel-rules.rst is totally incorrect these days.) Actually I still don't even know where your scripts are! They are not in stable-queue/scripts, it seems those are only Greg's scripts? And if I built something, how do I know you would even use it? You likely have all sorts of requirements that I don't even know about.
I've been playing with this in the past - I had a bot that looks at the mailing lists for patches that are tagged for stable, and attempts to apply/build then on the multiple trees to verify that it works and send a reply back if something goes wrong, asking for a backport.
It gets a bit tricky as there's no way to go back from a commit to the initial submission, you start hitting issues like:
- Patches get re-sent multiple times (think stuff like tip trees,
reviews from other maintainers, etc).
- Different versions of patches - for example, v1 was a single patch
and in v2 it became multiple patches.
I'm not arguing against your idea, I'm just saying that it's not trivial. An incomplete work here simply won't scale to the thousands of patches that flow in the trees, and won't be as useful. I don't think that this is trivial as you suggest.
There are obviously going to be edge cases; another one is commits that show up in git without ever having been sent to the mailing list. I don't think they actually matter very much, though. Worst case, we miss some things, but still find everything else.
If you disagree, and really think it's trivial, take 5 minutes to write something up? please?
I never said that it's "trivial" or that it would take only 5 minutes; that's just silly. Just that this is possible and it's what needs to be done.
If you don't have time, you should instead be helping ensure that the work gets done by someone else (internship, GSoC project, etc.).
And yes, I am interested in contributing, but as I mentioned I think you need to first acknowledge that there is a problem, fix your attitude of immediately pushing back on everything, and make it easier for people to contribute.
- Eric
On Sat, Mar 11, 2023 at 10:55:01AM -0800, Eric Biggers wrote:
If you disagree, and really think it's trivial, take 5 minutes to write something up? please?
I never said that it's "trivial" or that it would take only 5 minutes; that's just silly. Just that this is possible and it's what needs to be done.
(I did say that it would be trivial to query lore.kernel.org from a Python script, which is absolutely correct. I did *not* say that the whole thing, including the local index if querying lore.kernel.org directly is not scalable enough, would be trivial. Just that it's *possible*, and there do not seem to be any technical blockers...)
- Eric
On Sat, Mar 11, 2023 at 10:54:59AM -0800, Eric Biggers wrote:
On Sat, Mar 11, 2023 at 01:26:57PM -0500, Sasha Levin wrote:
"job"? do you think I'm paid to do this work?
Why would I stonewall improvements to the process?
I'm getting a bunch of suggestions and complaints that I'm not implementing those suggestions fast enough on my spare time.
One of the first things I would do if I was maintaining the stable kernels is to set up a way to automatically run searches on the mailing lists, and then take advantage of that in the stable process in various ways. Not having that is the root cause of a lot of the issues with the current process, IMO.
"if I was maintaining the stable kernels" - why is this rellevant? give us the tool you've proposed below and we'll be happy to use it. Heck, don't give it to us, use it to review the patches we're sending out for review and let us know if we've missed anything.
It's kind of a stretch to claim that maintaining the stable kernels is not part of your and Greg's jobs. But anyway, the real problem is that it's currently very hard for others to contribute, given the unique role the stable maintainers have and the lack of documentation about it. Each of the two maintainers has their own scripts, and it is not clear how they use them and what processes they follow. (Even just stable-kernel-rules.rst is totally incorrect these days.) Actually I still don't even know where your scripts are! They are not in stable-queue/scripts, it seems those are only Greg's scripts? And if I built something, how do I know you would even use it? You likely have all sorts of requirements that I don't even know about.
https://kernel.googlesource.com/pub/scm/linux/kernel/git/sashal/stable-tools...
I've last updated it about two years ago, but really it's not out of date - it just doesn't get that many changes at this point.
This is a mess we want to solve too: having a single repository with tools for "maintaining stable kernel trees" would be ideal and awesome, but it's quite the lift.
We ended up with different scripts because we started trying to solve different issues, and ended up converging into the same tree: even now, each of us handles different subsection of commits going into the kernel tree, we just end up pushing them into the same stable branch at the end.
I've been playing with this in the past - I had a bot that looks at the mailing lists for patches that are tagged for stable, and attempts to apply/build then on the multiple trees to verify that it works and send a reply back if something goes wrong, asking for a backport.
It gets a bit tricky as there's no way to go back from a commit to the initial submission, you start hitting issues like:
- Patches get re-sent multiple times (think stuff like tip trees,
reviews from other maintainers, etc).
- Different versions of patches - for example, v1 was a single patch
and in v2 it became multiple patches.
I'm not arguing against your idea, I'm just saying that it's not trivial. An incomplete work here simply won't scale to the thousands of patches that flow in the trees, and won't be as useful. I don't think that this is trivial as you suggest.
There are obviously going to be edge cases; another one is commits that show up in git without ever having been sent to the mailing list. I don't think they actually matter very much, though. Worst case, we miss some things, but still find everything else.
Consider the opposite, which I just saw earlier today with a commit that was tagged for stable: https://lore.kernel.org/all/20230217022200.3092987-1-yukuai1@huaweicloud.com...
Here, commit 1/2 reverts a previously broken fix, and is not marked for stable. Commit 2/2 applies the proper fix, but won't apply cleanly or correctly unless you have patch 1/2.
In this case you need both commits in the series, rather than none of them, otherwise you leave the trees broken.
If you disagree, and really think it's trivial, take 5 minutes to write something up? please?
I never said that it's "trivial" or that it would take only 5 minutes; that's just silly. Just that this is possible and it's what needs to be done.
If you don't have time, you should instead be helping ensure that the work gets done by someone else (internship, GSoC project, etc.).
My personal experience with this approach was that:
1. It ends up being more effort mentoring someone who is unfamailiar with this work rather than doing it myself.
2. There are very *very* few people who want to be doing this: to begin with the kernel is one of the less popular areas to get into, and on top of that the stable tree work is even worse because you do "maintenance" rather than write new shiny features.
And yes, I am interested in contributing, but as I mentioned I think you need to first acknowledge that there is a problem, fix your attitude of immediately pushing back on everything, and make it easier for people to contribute.
I don't think we disagree that the process is broken: this is one of the reasons we went away from trying to support 6 year LTS kernels.
However, we are not pushing back on ideas, we are asking for a hand in improving the process: we've been getting drive-by comments quite often, but when it comes to be doing the actual work people are quite reluctant to help.
If you want to sit down and scope out initial set of work around tooling to help here I'm more than happy to do that: I'm planning to be both in OSS and LPC if you want to do it in person, along with anyone else interested in helping out.
On Sat, Mar 11, 2023 at 11:25 PM Sasha Levin sashal@kernel.org wrote:
On Sat, Mar 11, 2023 at 10:54:59AM -0800, Eric Biggers wrote:
...
And yes, I am interested in contributing, but as I mentioned I think you need to first acknowledge that there is a problem, fix your attitude of immediately pushing back on everything, and make it easier for people to contribute.
I don't think we disagree that the process is broken: this is one of the reasons we went away from trying to support 6 year LTS kernels.
However, we are not pushing back on ideas, we are asking for a hand in improving the process: we've been getting drive-by comments quite often, but when it comes to be doing the actual work people are quite reluctant to help.
If you want to sit down and scope out initial set of work around tooling to help here I'm more than happy to do that: I'm planning to be both in OSS and LPC if you want to do it in person, along with anyone else interested in helping out.
Sasha,
Will you be able to attend a session on AUTOSEL on the overlap day of LSFMM and OSS (May 10) or earlier?
We were going to discuss the topic of filesystems and stable trees [1] anyway and I believe the discussion can be even more productive with you around.
I realize that the scope of AUTOSEL is wider than backporting filesystem fixes, but somehow, most of the developers on this thread are fs developers.
BTW, the story of filesystem testing in stable trees has also been improving since your last appearance in LSFMM.
Thanks, Amir.
[1] https://lore.kernel.org/linux-fsdevel/CACzhbgSZUCn-az1e9uCh0+AO314+yq6MJTTbF...
On Sun, Mar 12, 2023 at 10:04:23AM +0200, Amir Goldstein wrote:
On Sat, Mar 11, 2023 at 11:25 PM Sasha Levin sashal@kernel.org wrote:
On Sat, Mar 11, 2023 at 10:54:59AM -0800, Eric Biggers wrote:
...
And yes, I am interested in contributing, but as I mentioned I think you need to first acknowledge that there is a problem, fix your attitude of immediately pushing back on everything, and make it easier for people to contribute.
I don't think we disagree that the process is broken: this is one of the reasons we went away from trying to support 6 year LTS kernels.
However, we are not pushing back on ideas, we are asking for a hand in improving the process: we've been getting drive-by comments quite often, but when it comes to be doing the actual work people are quite reluctant to help.
If you want to sit down and scope out initial set of work around tooling to help here I'm more than happy to do that: I'm planning to be both in OSS and LPC if you want to do it in person, along with anyone else interested in helping out.
Sasha,
Will you be able to attend a session on AUTOSEL on the overlap day of LSFMM and OSS (May 10) or earlier?
We were going to discuss the topic of filesystems and stable trees [1] anyway and I believe the discussion can be even more productive with you around.
I realize that the scope of AUTOSEL is wider than backporting filesystem fixes, but somehow, most of the developers on this thread are fs developers.
BTW, the story of filesystem testing in stable trees has also been improving since your last appearance in LSFMM.
Happy to stop by and collaborate!
I'll also be in Vancouver the whole week (though not in LSF/MM), so if you'd want to find time for a workshop around this topic with interested parties we can look into that too.
On Sat, Mar 11, 2023 at 10:54:59AM -0800, Eric Biggers wrote:
On Sat, Mar 11, 2023 at 01:26:57PM -0500, Sasha Levin wrote:
"job"? do you think I'm paid to do this work?
Why would I stonewall improvements to the process?
I'm getting a bunch of suggestions and complaints that I'm not implementing those suggestions fast enough on my spare time.
One of the first things I would do if I was maintaining the stable kernels is to set up a way to automatically run searches on the mailing lists, and then take advantage of that in the stable process in various ways. Not having that is the root cause of a lot of the issues with the current process, IMO.
"if I was maintaining the stable kernels" - why is this rellevant? give us the tool you've proposed below and we'll be happy to use it. Heck, don't give it to us, use it to review the patches we're sending out for review and let us know if we've missed anything.
It's kind of a stretch to claim that maintaining the stable kernels is not part of your and Greg's jobs. But anyway, the real problem is that it's currently very hard for others to contribute, given the unique role the stable maintainers have and the lack of documentation about it. Each of the two maintainers has their own scripts, and it is not clear how they use them and what processes they follow.
Just a comment here about our scripts and process.
Our scripts are different as we both currently do different things for the stable trees. I have almost no scripts for finding patches, all I use is a git hook that dumps emails into a mbox and then go through them and queue them up to the quilt trees based on if they are valid or not after review.
My scripts primarily are for doing a release, not building the patches up.
That being said, I do have 2 scripts I use to run on an existing tree or series to verify that the fixes are all present already (i.e. if we have fixes for the fixes), but that's not really relevant for this discussion.
Those, and my big "treat the filesystem as a git database" hack can be found in this repo: https://git.sr.ht/~gregkh/linux-stable_commit_tree/ if you are curious, these are probably the relevant scripts if you are curious: https://git.sr.ht/~gregkh/linux-stable_commit_tree/tree/master/item/find_fix... https://git.sr.ht/~gregkh/linux-stable_commit_tree/tree/master/item/find_fix...
And I use: https://git.sr.ht/~gregkh/linux-stable_commit_tree/tree/master/item/id_found... all the time to determine if a SHA1 is in any stable releases.
(Even just stable-kernel-rules.rst is totally incorrect these days.)
I do not understand this, what is not correct?
It's how to get patches merged into stable kernels, we go above-and-beyond that for those developers and maintainers that do NOT follow those rules. If everyone followed them, we wouldn't be having this discussion at all :)
Actually I still don't even know where your scripts are! They are not in stable-queue/scripts, it seems those are only Greg's scripts? And if I built something, how do I know you would even use it? You likely have all sorts of requirements that I don't even know about.
I think what you are talking about here would require new work. New tools to dig in the commits to extract "here's the whole series of patches" would be wonderful, but as others have pointed out, it is _very_ common to have a cc: stable as the first few commits in a series, and then the rest have nothing to do with a stable tree.
But when doing something like what AUTOSEL does, digging up the whole series would be great. We have tools that can match up every commit in the tree to a specific email message (presentations on the tool and how it works have been a previous LinuxCon conferences), but if we can use lore.kernel.org for it, that would probably help everyone out.
And that's why I use the Link: tag, as Ted pointed out, for everything that I apply to all of the subsystems I work with. While I know Linus doesn't like it, I think it is quite valuable as it makes it so that _anyone_ can instantly find the thread where the patch came from, and no external tools are required.
Anyway, as always, I gladly accept help with figuring out what commits to apply to stable kernels. I've always said this, and Sasha has stepped up in an amazing way here over the years, creating tools based on collaboration with many others (see his presentations at conferences with Julia) on how to dig into the kernel repo to find patches that we all forget to tag for stable kernels and he sends them out for review.
If you want to help out and do much the same thing using different sorts of tools, or come up with other ways of finding the bugfixes that are in there that are not properly tagged, wonderful, I will gladly accept them, I have never turned down help like this.
And that's what I ask from companies all the time when they say "what can we do to help out?" A simple thing to do is dig in your vendor trees and send me the fixes that you have backported there. I know distros have this (and some distros help out and do this, I'll call out Debian for being very good here), and some companies do submit their backports as well (Amazon and Hawaii are good, Android also does a good job), but they are rare compared to all of the groups that I know use Linux.
Anyway, if anyone noticed the big problems this weekend with the stable releases were due to patches that were actually tagged with "cc: stable" so that's kind of proof that we all are human and even when we think a fix is enough, it can cause problems when it hits real world testing.
We are all human, the best we can do is when confronted with "hey, this fix causes a problem" is revert it and get the fix out to people as quick as possible. That includes fixes picked from tools like AUTOSEL as well as manual tags, there is no difference here in our response.
thanks,
greg k-h
On Mon, Mar 13, 2023 at 06:41:49PM +0100, Greg KH wrote:
(Even just stable-kernel-rules.rst is totally incorrect these days.)
I do not understand this, what is not correct?
It's how to get patches merged into stable kernels, we go above-and-beyond that for those developers and maintainers that do NOT follow those rules. If everyone followed them, we wouldn't be having this discussion at all :)
The entire list of rules for what patches are accepted into stable. This is a longstanding issue that has been reiterated many times in the past, see https://lore.kernel.org/stable/20220924182124.GA19210@duo.ucw.cz for example.
The fact is, many people *do* follow the rules exactly by *not* tagging commits for stable when they don't meet the documented eligibility criteria. But then the stable maintainers backport the commits anyway, as the real eligibility criteria are *much* more relaxed than what is documented.
- Eric
On Mon, Mar 13, 2023 at 11:54:17AM -0700, Eric Biggers wrote:
The fact is, many people *do* follow the rules exactly by *not* tagging commits for stable when they don't meet the documented eligibility criteria. But then the stable maintainers backport the commits anyway, as the real eligibility criteria are *much* more relaxed than what is documented.
Again, if you do NOT want your patches backported, because you always mark them properly for stable releases, just let us know and we will add you to the list of other subsystems and maintainers that have asked us for this in the past.
We can't detect the absence of a tag as "I know exactly what I am doing" because of the huge number of developers who do NOT tag patches, and so, we have to dig through the tree to find fixes on our own.
thanks,
greg k-h
On Sat, Mar 11, 2023 at 01:26:57PM -0500, Sasha Levin wrote:
I'm getting a bunch of suggestions and complaints that I'm not implementing those suggestions fast enough on my spare time.
BTW, the "I don't have enough time" argument is also a little frustrating because you are currently insisting on doing AUTOSEL at all, at the current sensitivity that picks up way too many commits. I can certainly imagine that that uses a lot of your time! But, many contributors are telling you that AUTOSEL is actually *worse than nothing* currently.
So to some extent this is a self-inflicted problem. You are *choosing* to spend your precious time running in-place with something that is not working well, instead of putting AUTOSEL on pause or turning down the sensitivity to free up time while improvements to the process are worked on.
(And yes, I know there are many stable patches besides AUTOSEL, and it's a lot of work, and I'm grateful for what you do. I am *just* talking about AUTOSEL here. And yes, I agree that AUTOSEL is needed in principle, so there's no need to re-hash the arguments for why it exists. It just needs some improvements.)
- Eric
On Sat, Mar 11, 2023 at 12:17:50PM -0800, Eric Biggers wrote:
On Sat, Mar 11, 2023 at 01:26:57PM -0500, Sasha Levin wrote:
I'm getting a bunch of suggestions and complaints that I'm not implementing those suggestions fast enough on my spare time.
BTW, the "I don't have enough time" argument is also a little frustrating because you are currently insisting on doing AUTOSEL at all, at the current sensitivity that picks up way too many commits. I can certainly imagine that that uses a lot of your time! But, many contributors are telling you that AUTOSEL is actually *worse than nothing* currently.
So to some extent this is a self-inflicted problem. You are *choosing* to spend your precious time running in-place with something that is not working well, instead of putting AUTOSEL on pause or turning down the sensitivity to free up time while improvements to the process are worked on.
(And yes, I know there are many stable patches besides AUTOSEL, and it's a lot of work, and I'm grateful for what you do. I am *just* talking about AUTOSEL here. And yes, I agree that AUTOSEL is needed in principle, so there's no need to re-hash the arguments for why it exists. It just needs some improvements.)
Just to make sure I'm sending the right message: I'd *love* to improve it, but I need help. I'm not pushing back on your ideas, I'm asking for help with their implementation.
Maybe I'm putting words in Greg's mouth, but I think we both would ideally want to standardize around a single set of tools and scripts, it's just the case that both of us started with different set of problems we were trying to solve, and so our tooling evolved independently.
On Sat, Mar 11, 2023 at 04:02:15PM -0500, Sasha Levin wrote:
Maybe I'm putting words in Greg's mouth, but I think we both would ideally want to standardize around a single set of tools and scripts, it's just the case that both of us started with different set of problems we were trying to solve, and so our tooling evolved independently.
I'm not even sure this is strictly necessary. When I started jumping on 2.6 Greg told me "now you have to follow the new process involving a first preview release", and that completely changed the way I was dealing with fixes in 2.4. I then tried to adopt Greg's scripts, and when you do that you instantly figure that different people are at ease with different parts of the process, and over time I started to keep a collection of hacked scripts that would simplify certain parts of the process for me (stuff as stupid as creating directories having the kernel name in hard-coded paths in my work directory, or converting the "cherry-picked from" to the "upstream commit" format, etc).
Your biggest difficulty probably is to find people willing and able to help, and while you should definitely show them your collection of tools to help them get started, these can also be frightening for newcomers or possibly not much useful to them if they come from a different background.
Cheers, Willy
On Sat, Mar 11, 2023 at 09:48:13AM -0800, Eric Biggers wrote:
The purpose of all these mailing list searches would be to generate a list of potential issues with backporting each commit, which would then undergo brief human review.
This is one big part that I suspect is underestimated. I'll speak from my past experience maintaining extended LTS for 3.10. I couldn't produce as many releases as I would have liked to because despite the scripts that helped me figure some series, some dependencies, origin branches etc, the whole process of reviewing ~600 patches to end up with ~200 at the end (and adapting some of them to fit) required ~16 hours a day for a full week-end, and I didn't always have that amount of time available. Any my choices were far from being perfect, as during the reviews I got a number of "please don't backport this there" and "if you take this one you also need these ones". Also I used to intentionally drop what had nothing to do on old LTS stuff so even from that perspective my work could have been perceived as insufficient.
The reviewing process is overwhelming, really. There is a point where you start to fail and make choices that are not better than a machine's. But is a mistake once in a while dramatic if on the other hand it fixes 200 other issues ? I think not as long as it's transparent and accepted by the users, because for one user that could experience a regression (one that escaped all the testing in place), thousands get fixes for existing problems. I'm not saying that regressions are good, I hate them, but as James said, we have to accept that user are part of the quality process.
My approach on another project I maintain is to announce upfront my own level of trust in my backport work, saying "I had a difficult week fixing that problem, do not rush on it or be extra careful", or "nothing urgent, no need to upgrade if you have no problem" or also "just upgrade, it's almost riskless". Users love that, because they know they're part of the quality assurance process, and they will either take small risks when they can, or wait for others to take risks.
But thinking that having one person review patches affecting many subsystem after pre-selection and extra info regarding discussions on each individual patch could result in more reliable stable releases is just an illusion IMHO, because the root of problem is that there are not enough humans to fix all the problems that humans introduce in the first place, and despite this we need to fix them. Just like automated scripts scraping lore, AUTOSEL does bring some value if it offloads some work from the available humans, even in its current state. And I hope that more of the selection and review work in the future will be automated and even less dependent on humans, because it does have a chance to be more reliable in front of that vast amount of work.
And in any case I've seen you use the word "trivial" several times in this thread, and for having been through a little bit of this process in the past, I wouldn't use that word anywhere in a description of what my experience had been. You really seem to underestimate the difficulty here.
Regards, Willy
On Sat, Mar 11, 2023 at 07:33:58PM +0100, Willy Tarreau wrote:
On Sat, Mar 11, 2023 at 09:48:13AM -0800, Eric Biggers wrote:
The purpose of all these mailing list searches would be to generate a list of potential issues with backporting each commit, which would then undergo brief human review.
This is one big part that I suspect is underestimated. I'll speak from my past experience maintaining extended LTS for 3.10. I couldn't produce as many releases as I would have liked to because despite the scripts that helped me figure some series, some dependencies, origin branches etc, the whole process of reviewing ~600 patches to end up with ~200 at the end (and adapting some of them to fit) required ~16 hours a day for a full week-end, and I didn't always have that amount of time available. Any my choices were far from being perfect, as during the reviews I got a number of "please don't backport this there" and "if you take this one you also need these ones". Also I used to intentionally drop what had nothing to do on old LTS stuff so even from that perspective my work could have been perceived as insufficient.
The reviewing process is overwhelming, really. There is a point where you start to fail and make choices that are not better than a machine's. But is a mistake once in a while dramatic if on the other hand it fixes 200 other issues ? I think not as long as it's transparent and accepted by the users, because for one user that could experience a regression (one that escaped all the testing in place), thousands get fixes for existing problems. I'm not saying that regressions are good, I hate them, but as James said, we have to accept that user are part of the quality process.
My approach on another project I maintain is to announce upfront my own level of trust in my backport work, saying "I had a difficult week fixing that problem, do not rush on it or be extra careful", or "nothing urgent, no need to upgrade if you have no problem" or also "just upgrade, it's almost riskless". Users love that, because they know they're part of the quality assurance process, and they will either take small risks when they can, or wait for others to take risks.
But thinking that having one person review patches affecting many subsystem after pre-selection and extra info regarding discussions on each individual patch could result in more reliable stable releases is just an illusion IMHO, because the root of problem is that there are not enough humans to fix all the problems that humans introduce in the first place, and despite this we need to fix them. Just like automated scripts scraping lore, AUTOSEL does bring some value if it offloads some work from the available humans, even in its current state. And I hope that more of the selection and review work in the future will be automated and even less dependent on humans, because it does have a chance to be more reliable in front of that vast amount of work.
As I said in a part of my email which you did not quote, the fallback option is to send the list of issues to the mailing list for others to review.
If even that fails, then it could be cut down to the *just the most useful* heuristics and decisions made automatically based on those... "Don't AUTOSEL patch N of a series without 1...N-1" might be a good one.
But again, this comes back to one of the core issues here which is how does one even build something for the stable maintainers if their requirements are unknown to others?
And in any case I've seen you use the word "trivial" several times in this thread, and for having been through a little bit of this process in the past, I wouldn't use that word anywhere in a description of what my experience had been. You really seem to underestimate the difficulty here.
I checked the entire email thread (https://lore.kernel.org/stable/?q=f%3Aebiggers+trivial). The only place I used the word "trivial" was mentioning that querying lore.kernel.org from a Python script might be trivial, which is true. And also in my response to Sasha's similar false claim that I was saying everything would be trivial.
I'm not sure why you're literally just making things up; it's not a very good way to have a productive discussion...
- Eric
On Sat, Mar 11, 2023 at 11:24:31AM -0800, Eric Biggers wrote:
On Sat, Mar 11, 2023 at 07:33:58PM +0100, Willy Tarreau wrote:
On Sat, Mar 11, 2023 at 09:48:13AM -0800, Eric Biggers wrote:
The purpose of all these mailing list searches would be to generate a list of potential issues with backporting each commit, which would then undergo brief human review.
This is one big part that I suspect is underestimated. I'll speak from my past experience maintaining extended LTS for 3.10. I couldn't produce as many releases as I would have liked to because despite the scripts that helped me figure some series, some dependencies, origin branches etc, the whole process of reviewing ~600 patches to end up with ~200 at the end (and adapting some of them to fit) required ~16 hours a day for a full week-end, and I didn't always have that amount of time available. Any my choices were far from being perfect, as during the reviews I got a number of "please don't backport this there" and "if you take this one you also need these ones". Also I used to intentionally drop what had nothing to do on old LTS stuff so even from that perspective my work could have been perceived as insufficient.
The reviewing process is overwhelming, really. There is a point where you start to fail and make choices that are not better than a machine's. But is a mistake once in a while dramatic if on the other hand it fixes 200 other issues ? I think not as long as it's transparent and accepted by the users, because for one user that could experience a regression (one that escaped all the testing in place), thousands get fixes for existing problems. I'm not saying that regressions are good, I hate them, but as James said, we have to accept that user are part of the quality process.
My approach on another project I maintain is to announce upfront my own level of trust in my backport work, saying "I had a difficult week fixing that problem, do not rush on it or be extra careful", or "nothing urgent, no need to upgrade if you have no problem" or also "just upgrade, it's almost riskless". Users love that, because they know they're part of the quality assurance process, and they will either take small risks when they can, or wait for others to take risks.
But thinking that having one person review patches affecting many subsystem after pre-selection and extra info regarding discussions on each individual patch could result in more reliable stable releases is just an illusion IMHO, because the root of problem is that there are not enough humans to fix all the problems that humans introduce in the first place, and despite this we need to fix them. Just like automated scripts scraping lore, AUTOSEL does bring some value if it offloads some work from the available humans, even in its current state. And I hope that more of the selection and review work in the future will be automated and even less dependent on humans, because it does have a chance to be more reliable in front of that vast amount of work.
As I said in a part of my email which you did not quote, the fallback option is to send the list of issues to the mailing list for others to review.
If even that fails, then it could be cut down to the *just the most useful* heuristics and decisions made automatically based on those... "Don't AUTOSEL patch N of a series without 1...N-1" might be a good one.
But again, this comes back to one of the core issues here which is how does one even build something for the stable maintainers if their requirements are unknown to others?
Another issue that I'd like to reiterate is that AUTOSEL is currently turned up to 11. It's simply selecting too much.
It should be made less sensitive and select higher confidence commits only.
That would cut down on the workload slightly.
(And please note, the key word here is *confidence*. We all agree that it's never possible to be absolutely 100% sure whether a commit is appropriate for stable or not. That's a red herring.
And I would assume, or at least hope, that the neural network thing being used for AUTOSEL outputs a confidence rating and not just a yes/no answer. If it actually just outputs yes/no, well how is anyone supposed to know that and fix that, given that it does not seem to be an open source project?)
- Eric
On Sat, Mar 11, 2023 at 11:46:05AM -0800, Eric Biggers wrote:
(And please note, the key word here is *confidence*. We all agree that it's never possible to be absolutely 100% sure whether a commit is appropriate for stable or not. That's a red herring.
In fact even developers themselves sometimes don't know, and even when they know, sometimes they know after committing it. Many times we've found that a bug was accidently resolved by a small change. Just for this it's important to support a post-merge analysis.
And I would assume, or at least hope, that the neural network thing being used for AUTOSEL outputs a confidence rating and not just a yes/no answer. If it actually just outputs yes/no, well how is anyone supposed to know that and fix that, given that it does not seem to be an open source project?)
Honestly I don't know. I ran a few experiments with natural language processors such as GPT-3 on commit messages which contained human-readable instructions, and asking "what am I expected to do with these patches", and seeing the bot respond "you should backport them to this version, change this and that in that version, and preliminary take that patch". It summarized extremely well the instructions delivered by the developer, which is awesome, but was not able to provide any form of confidence level. I don't know what Sasha uses but wouldn't be surprised it shares some such mechanisms and that it might not always be easy to get such a confidence level. But I could be wrong.
Willy
On Sat, Mar 11, 2023 at 09:19:54PM +0100, Willy Tarreau wrote:
On Sat, Mar 11, 2023 at 11:46:05AM -0800, Eric Biggers wrote:
(And please note, the key word here is *confidence*. We all agree that it's never possible to be absolutely 100% sure whether a commit is appropriate for stable or not. That's a red herring.
In fact even developers themselves sometimes don't know, and even when they know, sometimes they know after committing it. Many times we've found that a bug was accidently resolved by a small change. Just for this it's important to support a post-merge analysis.
And I would assume, or at least hope, that the neural network thing being used for AUTOSEL outputs a confidence rating and not just a yes/no answer. If it actually just outputs yes/no, well how is anyone supposed to know that and fix that, given that it does not seem to be an open source project?)
Honestly I don't know. I ran a few experiments with natural language processors such as GPT-3 on commit messages which contained human-readable instructions, and asking "what am I expected to do with these patches", and seeing the bot respond "you should backport them to this version, change this and that in that version, and preliminary take that patch". It summarized extremely well the instructions delivered by the developer, which is awesome, but was not able to provide any form of confidence level. I don't know what Sasha uses but wouldn't be surprised it shares some such mechanisms and that it might not always be easy to get such a confidence level. But I could be wrong.
It's actually pretty stupid: it uses the existence of ~10k of the most common words in commit messages + metrics from cqmetrics (github.com/dspinellis/cqmetrics) as input.
Although I get a score, which is already set pretty high, confidence is really non-existant here: at the end it depends mostly on the writing style of said commit author more than anything.
On Sat, Mar 11, 2023 at 11:24:31AM -0800, Eric Biggers wrote:
But thinking that having one person review patches affecting many subsystem after pre-selection and extra info regarding discussions on each individual patch could result in more reliable stable releases is just an illusion IMHO, because the root of problem is that there are not enough humans to fix all the problems that humans introduce in the first place, and despite this we need to fix them. Just like automated scripts scraping lore, AUTOSEL does bring some value if it offloads some work from the available humans, even in its current state. And I hope that more of the selection and review work in the future will be automated and even less dependent on humans, because it does have a chance to be more reliable in front of that vast amount of work.
As I said in a part of my email which you did not quote, the fallback option is to send the list of issues to the mailing list for others to review.
Honestly, patches are already being delivered publicly tagged AUTOSEL, then published again as part of the stable review process. Have you seen the amount of feedback ? Once in a while there are responses, but aside Guenter reporting build successes or failures, it's a bit quiet. What makes you think that sending more detailed stuff that require even more involvement and decision would trigger more participation ?
If even that fails, then it could be cut down to the *just the most useful* heuristics and decisions made automatically based on those... "Don't AUTOSEL patch N of a series without 1...N-1" might be a good one.
I do think that this one would be an improvement. But it needs to push harder. Not "don't autosel", but sending the message to relevant parties (all those involved in the patch being reviewed and merged) indicating "we are going to merge this patch, but it's part of the following series, should any/all/none of them be picked ? barring any response only this patch will be picked". And of course, ideally all selected ones from a series should be proposed at once to ease the review.
But again, this comes back to one of the core issues here which is how does one even build something for the stable maintainers if their requirements are unknown to others?
Well, the description of the commit message is there for anyone to consume in the first place. A commit message is an argument for a patch to get adopted and resist any temptations to revert it. So it must be descriptive enough and give instructions. Dependencies should be clear there. When you seen github-like one-liners there's no hope to get much info, and it's not a matter of requirements, but of respect for a team development process where some facing your patch might miss the skills required to grasp the details. With a sufficiently clear commit message, even a bot can find (or suggest) dependencies. And this is not specific to -stable: if one of the dependencies is found to break stuff, how do you know it must not be reverted without reverting the whole series if that's not described anywhere ?
And in any case I've seen you use the word "trivial" several times in this thread, and for having been through a little bit of this process in the past, I wouldn't use that word anywhere in a description of what my experience had been. You really seem to underestimate the difficulty here.
I checked the entire email thread (https://lore.kernel.org/stable/?q=f%3Aebiggers+trivial). The only place I used the word "trivial" was mentioning that querying lore.kernel.org from a Python script might be trivial, which is true.
I'm unable to do it, so at best it's trivial for someone at ease with Python and the lore API. And parsing the results and classifying them might not be trivial at all either. Getting information is one part, processing it is another thing.
And also in my response to Sasha's similar false claim that I was saying everything would be trivial.
I'm not sure why you're literally just making things up; it's not a very good way to have a productive discussion...
I'm not making things up. Maybe you wrote "trivial" only once but the tone of your suggestions from the beginning was an exact description of something called trivial and made me feel you find all of this "trivial", which you finally confirmed in that excerpt above.
Quite frankly, I'm not part of this process anymore and am really thankful that the current maintainers are doing that work. But it makes me feel really uneasy to read suggestions basically sounding like "why don't you fix your broken selection process" or "it should just be as trivial as collecting the missing info from lore". Had I received such contemptuous "suggestions" when I was doing that job, I would just have resigned. And just saying things like "I will not start helping before you change your attitude" you appear infantilizing at best and in no way looking like you're really willing to help. Sasha said he was open to receive proposals and suddenly the trivial job gets conditions. Just do your part of the work that seems poorly done to you, and everyone will see if your ideas and work finally helped or not. Nobody will even care if it was trivial or if it ended up taking 4 months of refining, as long as it helps in the end. But I suspect that you're not interested in helping, just in complaining.
One thing I think that could be within reach and could very slightly improve the process would be to indicate in a stable announce the amount of patches coming from autosel. I think that it could help either refining the selection by making users more conscious about the importance of this source, or encourage more developers to Cc stable to reduce that ratio. Just an idea.
Willy
On Sat, Mar 11, 2023 at 09:11:51PM +0100, Willy Tarreau wrote:
On Sat, Mar 11, 2023 at 11:24:31AM -0800, Eric Biggers wrote:
But thinking that having one person review patches affecting many subsystem after pre-selection and extra info regarding discussions on each individual patch could result in more reliable stable releases is just an illusion IMHO, because the root of problem is that there are not enough humans to fix all the problems that humans introduce in the first place, and despite this we need to fix them. Just like automated scripts scraping lore, AUTOSEL does bring some value if it offloads some work from the available humans, even in its current state. And I hope that more of the selection and review work in the future will be automated and even less dependent on humans, because it does have a chance to be more reliable in front of that vast amount of work.
As I said in a part of my email which you did not quote, the fallback option is to send the list of issues to the mailing list for others to review.
Honestly, patches are already being delivered publicly tagged AUTOSEL, then published again as part of the stable review process. Have you seen the amount of feedback ? Once in a while there are responses, but aside Guenter reporting build successes or failures, it's a bit quiet. What makes you think that sending more detailed stuff that require even more involvement and decision would trigger more participation ?
Well, there is not much people can do with 1000 patches with no context, but if there's a much shorter list of potential issues to pay attention to, I'd hope that would be helpful.
I checked the entire email thread (https://lore.kernel.org/stable/?q=f%3Aebiggers+trivial). The only place I used the word "trivial" was mentioning that querying lore.kernel.org from a Python script might be trivial, which is true.
I'm unable to do it, so at best it's trivial for someone at ease with Python and the lore API. And parsing the results and classifying them might not be trivial at all either. Getting information is one part, processing it is another thing.
And also in my response to Sasha's similar false claim that I was saying everything would be trivial.
I'm not sure why you're literally just making things up; it's not a very good way to have a productive discussion...
I'm not making things up. Maybe you wrote "trivial" only once but the tone of your suggestions from the beginning was an exact description of something called trivial and made me feel you find all of this "trivial", which you finally confirmed in that excerpt above.
You are indeed making things up, which is annoying as it makes it hard to have a real discussion. Anyway, hopefully we can get past that.
Quite frankly, I'm not part of this process anymore and am really thankful that the current maintainers are doing that work. But it makes me feel really uneasy to read suggestions basically sounding like "why don't you fix your broken selection process" or "it should just be as trivial as collecting the missing info from lore". Had I received such contemptuous "suggestions" when I was doing that job, I would just have resigned. And just saying things like "I will not start helping before you change your attitude" you appear infantilizing at best and in no way looking like you're really willing to help. Sasha said he was open to receive proposals and suddenly the trivial job gets conditions. Just do your part of the work that seems poorly done to you, and everyone will see if your ideas and work finally helped or not. Nobody will even care if it was trivial or if it ended up taking 4 months of refining, as long as it helps in the end. But I suspect that you're not interested in helping, just in complaining.
One thing I think that could be within reach and could very slightly improve the process would be to indicate in a stable announce the amount of patches coming from autosel. I think that it could help either refining the selection by making users more conscious about the importance of this source, or encourage more developers to Cc stable to reduce that ratio. Just an idea.
I'll try to put something together, despite all the pushback I'm getting. But by necessity it will be totally separate from the current stable scripts, as it seems there is no practical way for me to do it otherwise, given that the current stable process is not properly open and lacks proper leadership.
- Eric
On Sat, Mar 11, 2023 at 12:53:29PM -0800, Eric Biggers wrote:
I'll try to put something together, despite all the pushback I'm getting.
Thanks.
But by necessity it will be totally separate from the current stable scripts, as it seems there is no practical way for me to do it otherwise,
It's better that way anyway. Adding diversity to the process is important if we want to experiment with multiple approaches. What matters is to have multiple inputs on list of patches.
given that the current stable process is not properly open and lacks proper leadership.
Please, really please, stop looping on this. I think it was already explained quite a few times that the process is mostly human, and that it's very difficult to document what has to be done. It's a lot of work based on common sense, intuition and experience which helps solving each an every individual case. The scripts that help are public, the rest is just experience. It's not fair to say that some people do not follow an open process while they're using their experience and intuition. They're not machines.
Thanks, Willy
On Sun, Mar 12, 2023 at 05:32:32AM +0100, Willy Tarreau wrote:
On Sat, Mar 11, 2023 at 12:53:29PM -0800, Eric Biggers wrote:
I'll try to put something together, despite all the pushback I'm getting.
Thanks.
But by necessity it will be totally separate from the current stable scripts, as it seems there is no practical way for me to do it otherwise,
It's better that way anyway. Adding diversity to the process is important if we want to experiment with multiple approaches. What matters is to have multiple inputs on list of patches.
given that the current stable process is not properly open and lacks proper leadership.
Please, really please, stop looping on this. I think it was already explained quite a few times that the process is mostly human, and that it's very difficult to document what has to be done. It's a lot of work based on common sense, intuition and experience which helps solving each an every individual case. The scripts that help are public, the rest is just experience. It's not fair to say that some people do not follow an open process while they're using their experience and intuition. They're not machines.
I mean, "patches welcome" is a bit pointless when there is nothing to patch, is it not? Even Sasha's stable-tools, which he finally gave a link to, does not include anything related to AUTOSEL. It seems AUTOSEL is still closed source.
BTW, I already did something similar "off to the side" a few years ago when I wrote a script to keep track of and prioritize syzbot reports from https://syzkaller.appspot.com/, and generate per-subsystem reminder emails.
I eventually ended up abandoning that, because doing something off to the side is not very effective and is hard to keep up with. The right approach is to make improvements to the "upstream" process (which was syzbot in that case), not to bolt something on to the side to try to fix it after the fact.
So I hope people can understand where I'm coming from, with hoping that what the stable maintainers are doing can just be improved directly, without first building something from scratch off to the side as that is just not a good way to do things. But sure, if that's the only option to get anything nontrivial changed, I'll try to do it.
- Eric
On Sat, Mar 11, 2023 at 09:21:58PM -0800, Eric Biggers wrote:
I mean, "patches welcome" is a bit pointless when there is nothing to patch, is it not?
Maybe it's because they're used to regularly receive complaints suggesting to improve the process without knowing where to start from.
Even Sasha's stable-tools, which he finally gave a link to, does not include anything related to AUTOSEL. It seems AUTOSEL is still closed source.
I don't know.
BTW, I already did something similar "off to the side" a few years ago when I wrote a script to keep track of and prioritize syzbot reports from https://syzkaller.appspot.com/, and generate per-subsystem reminder emails.
I eventually ended up abandoning that, because doing something off to the side is not very effective and is hard to keep up with. The right approach is to make improvements to the "upstream" process (which was syzbot in that case), not to bolt something on to the side to try to fix it after the fact.
I think that improving the upstreaming process does have some value, of course, especially when it's done from tools that can reliably and durably be improved. But it's not rocket science when input comes from humans. We still occasionally see patches missing an s-o-b. Humans can't be fixed, and the more the constraints that are added on them, the higher the failure rate they will show. However, regardless of the detailed knowledge of how to format this or that tag, it should still be possible for any patch author to answer the question "what do you want us to do with that patch". If most of the patches at least contain such info, it already becomes possible to figure after it gets merged whether the intent was to get it backported or not. It's not trivial but NLP and code analysis definitely help on this. And there will always be some patches whose need for backporting will be detected after the merge. That's why I'm seeing a lot of value in the post-processing part, because for me trying to fix the input will have a very limited effect.
And let's face it, if a patch series gets merged, it means that at some point someone understood what to do with it, so all the needed information was already there, given a certain context. The cost is thousands of brains needed to decode that. But with the improvements in language processing, we should at some point be able to release some brains with more-or-less similar results and try to lower the barrier to contribution by relaxing patch submission rules instead of adding more. My feeling is that it's what we should aim for given that the number of maintainers and contributors doesn't seem to grow as fast as the code size and complexity.
That's where I'm seeing a lot of value in the AUTOSEL work. If it can show the direction to something better so that in 3 or 4 years we can think back and say "remember the garbage it was compared to what we have now", I think it will have been a fantastic success.
So I hope people can understand where I'm coming from, with hoping that what the stable maintainers are doing can just be improved directly, without first building something from scratch off to the side as that is just not a good way to do things.
It's generally not good but here there's little to start from, and experimenting with your ideas on LKML threads or commit series can be a nice way to explore completely different approaches without being limited by what currently exists. Maybe you'll end up with something completely orthogonal and the combination of both of your solutions will already be a nice improvement. who knows.
But sure, if that's the only option to get anything nontrivial changed, I'll try to do it.
Thanks! Willy
On Sun, Mar 12, 2023 at 7:41 AM Eric Biggers ebiggers@kernel.org wrote:
...
I mean, "patches welcome" is a bit pointless when there is nothing to patch, is it not? Even Sasha's stable-tools, which he finally gave a link to, does not include anything related to AUTOSEL. It seems AUTOSEL is still closed source.
BTW, I already did something similar "off to the side" a few years ago when I wrote a script to keep track of and prioritize syzbot reports from https://syzkaller.appspot.com/, and generate per-subsystem reminder emails.
I eventually ended up abandoning that, because doing something off to the side is not very effective and is hard to keep up with. The right approach is to make improvements to the "upstream" process (which was syzbot in that case), not to bolt something on to the side to try to fix it after the fact.
So I hope people can understand where I'm coming from, with hoping that what the stable maintainers are doing can just be improved directly, without first building something from scratch off to the side as that is just not a good way to do things. But sure, if that's the only option to get anything nontrivial changed, I'll try to do it.
Eric,
Did you consider working to improve or add functionality to b4?
Some of the things that you proposed sound like a natural extension of b4 that stable maintainers could use if they turn out to be useful.
Some of the things in your wish list, b4 already does - it has a local cache of messages from query results, it can find the latest submission of a patch series.
When working on backporting xfs patches to LTS, I ended up trying to improve and extend b4 [1].
My motivation was to retrieve the information provided in the pull request and cover letters which often have much better context to understand whether patches are stable material.
My motivation was NOT to improve automatic scripts, but I did make some improvement to b4 that could benefit automatic scripts as well.
Alas, despite sending a pull request via github and advertising my work and its benefits on several occasions, I got no feedback from Konstantin nor from any other developers, so I did not pursue upstreaming.
If you find any part of this work relevant, I can try to rebase and post my b4 patches.
Thanks, Amir.
On Sun, Mar 12, 2023 at 09:42:59AM +0200, Amir Goldstein wrote:
Alas, despite sending a pull request via github and advertising my work and its benefits on several occasions, I got no feedback from Konstantin nor from any other developers, so I did not pursue upstreaming.
If you find any part of this work relevant, I can try to rebase and post my b4 patches.
...
b4 development is mainly done via email on the tools@linux.kernel.org list, and https://git.kernel.org/pub/scm/utils/b4/b4.git rather than that github repository (note that yours is the first and only pull request there) is the main repo. I suspect that github repo is just an automatically maintained mirror and nobody's seen your pull request, you'd be much more likely to get a response sending your patches to the list CC Konstantin.
On Sun, Mar 12, 2023 at 09:42:59AM +0200, Amir Goldstein wrote:
On Sun, Mar 12, 2023 at 7:41 AM Eric Biggers ebiggers@kernel.org wrote:
...
I mean, "patches welcome" is a bit pointless when there is nothing to patch, is it not? Even Sasha's stable-tools, which he finally gave a link to, does not
"finally" being all the way back in 2015 (https://lkml.org/lkml/2015/11/9/422), and getting no contributions from 3rd parties for the next 7 years.
Really, we're not pushing back on ideas, we're just saying that this is something has happened before ("open up your scripts so we can reuse/improve") and getting nowhere.
include anything related to AUTOSEL. It seems AUTOSEL is still closed source.
Not as much as closed source as a complete mess on a VM I'm afraid to lose.
I didn't really want to try and invest the effort into extracting it out becuase:
1. It's one of the first things I did when I started learning about neural networks, and in practice it's inefficient and could use a massive overhaul (or rewrite). For example, it currently has ~15k inputs, which means that it needs a beefy GPU to be able to run on (it won't run on your home GPU)..
2. At that time I wasn't familiar with coding for NN either, so it's a mess of LUA code to run under Torch (yes, not even pytorch).
3. It's very tied to how VMs on Azure operate, and could use a lot of abstraction.
So really, I'd much rather invest effort into rewriting this mess rather than digging it out of the crevices of the VM it's sitting in.
BTW, I already did something similar "off to the side" a few years ago when I wrote a script to keep track of and prioritize syzbot reports from https://syzkaller.appspot.com/, and generate per-subsystem reminder emails.
I eventually ended up abandoning that, because doing something off to the side is not very effective and is hard to keep up with. The right approach is to make improvements to the "upstream" process (which was syzbot in that case), not to bolt something on to the side to try to fix it after the fact.
So I hope people can understand where I'm coming from, with hoping that what the stable maintainers are doing can just be improved directly, without first building something from scratch off to the side as that is just not a good way to do things. But sure, if that's the only option to get anything nontrivial changed, I'll try to do it.
Eric,
Did you consider working to improve or add functionality to b4?
FTR, I'm happy to shift investment into tooling for stable maintenance into b4.
On Sat, Mar 11, 2023 at 09:11:51PM +0100, Willy Tarreau wrote:
On Sat, Mar 11, 2023 at 11:24:31AM -0800, Eric Biggers wrote:
As I said in a part of my email which you did not quote, the fallback option is to send the list of issues to the mailing list for others to review.
Honestly, patches are already being delivered publicly tagged AUTOSEL, then published again as part of the stable review process. Have you seen the amount of feedback ? Once in a while there are responses, but aside Guenter reporting build successes or failures, it's a bit quiet. What makes you think that sending more detailed stuff that require even more involvement and decision would trigger more participation ?
TBH as someone getting copied on the AUTOSEL mails I think if the volume of backports is going to say the same what I'd really like is something that mitigates the volume of mail, or at least makes the mails that are being sent more readily parseable. Things that add more context to what's being sent would probably help a lot, right now I'm not really able to do much more than scan for obviously harmful things.
But again, this comes back to one of the core issues here which is how does one even build something for the stable maintainers if their requirements are unknown to others?
Well, the description of the commit message is there for anyone to consume in the first place. A commit message is an argument for a patch to get adopted and resist any temptations to revert it. So it must be descriptive enough and give instructions. Dependencies should be clear there. When you seen github-like one-liners there's no hope to get much info, and it's not a matter of requirements, but of respect for a team development process where some facing your patch might miss the skills required to grasp the details. With a sufficiently clear commit message, even a bot can find (or suggest) dependencies. And this is not specific to -stable: if one of the dependencies is found to break stuff, how do you know it must not be reverted without reverting the whole series if that's not described anywhere ?
I'd say that the most common class of missing dependency I've seen is on previously applied code which is much less likely to be apparent in the commit message and probably not noticed unless it causes a cherry pick or build issue.
One thing I think that could be within reach and could very slightly improve the process would be to indicate in a stable announce the amount of patches coming from autosel. I think that it could help either refining the selection by making users more conscious about the importance of this source, or encourage more developers to Cc stable to reduce that ratio. Just an idea.
I'm not sure if it's the ratio that's the issue here, if anything I'd expect that lowering the ratio would make people more stressed by AUTOSEL since assuming a similar volume of patches get picked overall it would increase the percentage of the AUTOSEL patches that have problems.
...
I suppose that if I had a way to know if a certain a commit is part of a series, I could either take all of it or none of it, but I don't think I have a way of doing that by looking at a commit in Linus' tree (suggestions welcome, I'm happy to implement them).
Could something in git (eg when patches get applied) add dependencies between the patches in a series. While that won't force the entire series be added, it would ensure that all earlier patches are added.
David
- Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK Registration No: 1397386 (Wales)
On Sat, Mar 11, 2023 at 10:38:10PM +0000, David Laight wrote:
...
I suppose that if I had a way to know if a certain a commit is part of a series, I could either take all of it or none of it, but I don't think I have a way of doing that by looking at a commit in Linus' tree (suggestions welcome, I'm happy to implement them).
Could something in git (eg when patches get applied) add dependencies between the patches in a series. While that won't force the entire series be added, it would ensure that all earlier patches are added.
I suspect that having an option to keep the message ID in the footer (a bit like the "cherry-picked from" tag but instead "blongs to series") could possibly help. And when no such info is present we could have one ID generated per "git am" execution since usually if you apply an mbox, it constitutes a series (but not always of course, though it's not difficult to arrange series like this).
Willy
On Sun, Mar 12, 2023 at 05:41:07AM +0100, Willy Tarreau wrote:
I suspect that having an option to keep the message ID in the footer (a bit like the "cherry-picked from" tag but instead "blongs to series") could possibly help. And when no such info is present we could have one ID generated per "git am" execution since usually if you apply an mbox, it constitutes a series (but not always of course, though it's not difficult to arrange series like this).
As I pointed out earlier, some of us are adding the message ID in the footer alrady, using a Link tag. This is even documented already in the Kernel Maintainer's Handbook, so I'm pretty sure it's not just me. :-)
https://www.kernel.org/doc/html/latest/maintainer/configure-git.html#creatin...
This is quite sufficient to extract out the full patch series given a particular patch. The b4 python script does this this; given a single Message-Id, it can find all of the other patches in the series. I won't say that it it's "trivial", but the code already exists, and you can copy and paste it from b4. Or just have your script shell out to "b4 am -o /tmp/scratchdir $MSGID"
- Ted
On Sun 12-03-23 00:09:47, Theodore Ts'o wrote:
On Sun, Mar 12, 2023 at 05:41:07AM +0100, Willy Tarreau wrote:
I suspect that having an option to keep the message ID in the footer (a bit like the "cherry-picked from" tag but instead "blongs to series") could possibly help. And when no such info is present we could have one ID generated per "git am" execution since usually if you apply an mbox, it constitutes a series (but not always of course, though it's not difficult to arrange series like this).
As I pointed out earlier, some of us are adding the message ID in the footer alrady, using a Link tag. This is even documented already in the Kernel Maintainer's Handbook, so I'm pretty sure it's not just me. :-)
Yeah, given Linus' rants about links pointing to patch posting, what I'm currently doing is that I add Message-ID: tag to the patch instead of a Link: tag. It preserves the information as well and it is obvious to human reader what are links to reports / discussions and what is just a link to patch posting.
Honza
On Mon, Feb 27, 2023 at 09:47:46AM -0800, Eric Biggers wrote:
One of the challanges here is that it's difficult to solicit reviews or really any interaction from authors after a commit lands upstream. Look at the response rates to Greg's "FAILED" emails that ask authors to provide backports to commits they tagged for stable.
Well, it doesn't help that most of the stable emails aren't sent to the subsystem's mailing list, but instead just to the individual people mentioned in the commit. So many people who would like to help never know about it.
Let me joining in the discussion.
In this case, Greg send these FAILED emails to upstream commit authors, but some of them have expectation that their commits can be backported automatically (without conflicts), so when Greg ask them to "manually" do the backport (and with resolving conflicts between), they have little to no idea on what should they do. That's why there was a doc patch sent [1] specifically on this matter. Fortunately, some others are aware on this and send the backport.
I don't know, is it obvious? You've said in the past that sometimes you'd like to backport a commit even if the maintainer objects and/or it is known buggy. https://lore.kernel.org/stable/d91aaff1-470f-cfdf-41cf-031eea9d6aca@mailbox.... also didn't explicitly say "Don't backport this" but instead "This patch has issues", so maybe that made a difference?
Anyway, the fact is that it happened. And if it happened in the one bug that I happened to look at because it personally affected me and I spent hours bisecting, it probably is happening in lots of other cases too. So it seems the process is not working...
Separately from responses to the AUTOSEL email, it also seems that you aren't checking for any reported regressions or pending fixes for a commit before backporting it. Simply searching lore for the commit title https://lore.kernel.org/all/?q=%22drm%2Famdgpu%3A+use+dirty+framebuffer+help... would have turned up the bug report https://lore.kernel.org/dri-devel/20220918120926.10322-1-user@am64/ that bisected a regression to that commit, as well as a patch that Fixes that commit: https://lore.kernel.org/all/20220920130832.2214101-1-alexander.deucher@amd.c... Both of these existed before you even sent the AUTOSEL email!
So to summarize, that buggy commit was backported even though:
- There were no indications that it was a bug fix (and thus potentially suitable for stable) in the first place.
- On the AUTOSEL thread, someone told you the commit is broken.
- There was already a thread that reported a regression caused by the commit. Easily findable via lore search.
- There was also already a pending patch that Fixes the commit. Again easily findable via lore search.
Recently there was a regression in linux-5.15.y that was caused by faulty backport of upstream 85636167e3206c ("drm/i915: Don't use BAR mappings for ring buffers with LLC") as 4eb6789f9177a5 ("drm/i915: Don't use BAR mappings for ring buffers with LLC"). Fortunately, the upstream commit wasn't AUTOSEL'd by Sasha's scripts and the culprit backport got reverted.
[CC'ed the upstream commit author and corresponding ML.]
So it seems a *lot* of things went wrong, no? Why? If so many things can go wrong, it's not just a "mistake" but rather the process is the problem...
Testing the backports are also important, hence before a stable release is made, there is -rc review when testers can test linux-x.y trees as published in linux-stable-rc.git tree. The testing quality depends on how many testers test and send their report, as well as the type of test. For example, I do basic cross-compile test for arm64 and ppc64.
Did they spot the regression I mentioned earlier? Nope, until after the release, where production users reported the regression. If they do, they would have replied to appropriate backported patch that caused the problem.
Thanks.
[1]: https://lore.kernel.org/linux-doc/20230303162553.17212-1-vegard.nossum@oracl...
From: Zhang Qiao zhangqiao22@huawei.com
[ Upstream commit 829c1651e9c4a6f78398d3e67651cef9bb6b42cc ]
When a scheduling entity is placed onto cfs_rq, its vruntime is pulled to the base level (around cfs_rq->min_vruntime), so that the entity doesn't gain extra boost when placed backwards.
However, if the entity being placed wasn't executed for a long time, its vruntime may get too far behind (e.g. while cfs_rq was executing a low-weight hog), which can inverse the vruntime comparison due to s64 overflow. This results in the entity being placed with its original vruntime way forwards, so that it will effectively never get to the cpu.
To prevent that, ignore the vruntime of the entity being placed if it didn't execute for much longer than the characteristic sheduler time scale.
[rkagan: formatted, adjusted commit log, comments, cutoff value] Signed-off-by: Zhang Qiao zhangqiao22@huawei.com Co-developed-by: Roman Kagan rkagan@amazon.de Signed-off-by: Roman Kagan rkagan@amazon.de Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20230130122216.3555094-1-rkagan@amazon.de Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/sched/fair.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2c3d0d49c80ea..a976e80920594 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4640,6 +4640,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = cfs_rq->min_vruntime; + u64 sleep_time;
/* * The 'current' period is already promised to the current tasks, @@ -4669,8 +4670,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; }
- /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * Pull vruntime of the entity being placed to the base level of + * cfs_rq, to prevent boosting it if placed backwards. If the entity + * slept for a long time, don't even try to compare its vruntime with + * the base as it may be too far off and the comparison may get + * inversed due to s64 overflow. + */ + sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; + if ((s64)sleep_time > 60LL * NSEC_PER_SEC) + se->vruntime = vruntime; + else + se->vruntime = max_vruntime(se->vruntime, vruntime); }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
From: Qu Wenruo wqu@suse.com
[ Upstream commit 28232909ba43561887508a6ef46d7f33a648f375 ]
[BUG] When debugging a scrub related metadata error, it turns out that our metadata error reporting is not ideal.
The only 3 error messages are:
- BTRFS error (device dm-2): bdev /dev/mapper/test-scratch1 errs: wr 0, rd 0, flush 0, corrupt 0, gen 1 Showing we have metadata generation mismatch errors.
- BTRFS error (device dm-2): unable to fixup (regular) error at logical 7110656 on dev /dev/mapper/test-scratch1 Showing which tree blocks are corrupted.
- BTRFS warning (device dm-2): checksum/header error at logical 24772608 on dev /dev/mapper/test-scratch2, physical 3801088: metadata node (level 1) in tree 5 Showing which physical range the corrupted metadata is at.
We have to combine the above 3 to know we have a corrupted metadata with generation mismatch.
And this is already the better case, if we have other problems, like fsid mismatch, we can not even know the cause.
[CAUSE] The problem is caused by the fact that, scrub_checksum_tree_block() never outputs any error message.
It just return two bits for scrub: sblock->header_error, and sblock->generation_error.
And later we report error in scrub_print_warning(), but unfortunately we only have two bits, there is not really much thing we can done to print any detailed errors.
[FIX] This patch will do the following to enhance the error reporting of metadata scrub:
- Add extra warning (ratelimited) for every error we hit This can help us to distinguish the different types of errors. Some errors can help us to know what's going wrong immediately, like bytenr mismatch.
- Re-order the checks Currently we check bytenr first, then immediately generation. This can lead to false generation mismatch reports, while the fsid mismatches.
Here is the new output for the bug I'm debugging (we forgot to writeback tree blocks for commit roots):
BTRFS warning (device dm-2): tree block 24117248 mirror 1 has bad fsid, has b77cd862-f150-4c71-90ec-7baf0544d83f want 17df6abf-23cd-445f-b350-5b3e40bfd2fc BTRFS warning (device dm-2): tree block 24117248 mirror 0 has bad fsid, has b77cd862-f150-4c71-90ec-7baf0544d83f want 17df6abf-23cd-445f-b350-5b3e40bfd2fc
Now we can immediately know it's some tree blocks didn't even get written back, other than the original confusing generation mismatch.
Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/scrub.c | 49 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 196c4c6ed1ed8..c5d8dc112fd58 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2036,20 +2036,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (sblock->logical != btrfs_stack_header_bytenr(h)) + if (sblock->logical != btrfs_stack_header_bytenr(h)) { sblock->header_error = 1; - - if (sector->generation != btrfs_stack_header_generation(h)) { - sblock->header_error = 1; - sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_bytenr(h), + sblock->logical); + goto out; }
- if (!scrub_check_fsid(h->fsid, sector)) + if (!scrub_check_fsid(h->fsid, sector)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->fsid, sblock->dev->fs_devices->fsid); + goto out; + }
- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, - BTRFS_UUID_SIZE)) + if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + sblock->logical, sblock->mirror_num, + h->chunk_tree_uuid, fs_info->chunk_tree_uuid); + goto out; + }
shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); @@ -2062,9 +2075,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) }
crypto_shash_final(shash, calculated_csum); - if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) + if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { sblock->checksum_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, + sblock->logical, sblock->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + goto out; + } + + if (sector->generation != btrfs_stack_header_generation(h)) { + sblock->header_error = 1; + sblock->generation_error = 1; + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", + sblock->logical, sblock->mirror_num, + btrfs_stack_header_generation(h), + sector->generation); + }
+out: return sblock->header_error || sblock->checksum_error; }
From: Michael Grzeschik m.grzeschik@pengutronix.de
[ Upstream commit 32405e532d358a2f9d4befae928b9883c8597616 ]
Since we need to support legacy phys with the dwc3 controller, we enable this quirk on the zynqmp platforms.
Signed-off-by: Michael Grzeschik m.grzeschik@pengutronix.de Link: https://lore.kernel.org/r/20221023215649.221726-1-m.grzeschik@pengutronix.de Signed-off-by: Michal Simek michal.simek@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm64/boot/dts/xilinx/zynqmp.dtsi | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi index a549265e55f6e..7c1af75f33a05 100644 --- a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi +++ b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi @@ -825,6 +825,7 @@ dwc3_0: usb@fe200000 { clock-names = "bus_early", "ref"; iommus = <&smmu 0x860>; snps,quirk-frame-length-adjustment = <0x20>; + snps,resume-hs-terminations; /* dma-coherent; */ }; }; @@ -851,6 +852,7 @@ dwc3_1: usb@fe300000 { clock-names = "bus_early", "ref"; iommus = <&smmu 0x861>; snps,quirk-frame-length-adjustment = <0x20>; + snps,resume-hs-terminations; /* dma-coherent; */ }; };
From: Peter Zijlstra peterz@infradead.org
[ Upstream commit 821ad23d0eaff73ef599ece39ecc77482df20a8c ]
Fix instrumentation bugs objtool found:
vmlinux.o: warning: objtool: intel_idle_s2idle+0xd5: call to fpu_idle_fpregs() leaves .noinstr.text section vmlinux.o: warning: objtool: intel_idle_xstate+0x11: call to fpu_idle_fpregs() leaves .noinstr.text section vmlinux.o: warning: objtool: fpu_idle_fpregs+0x9: call to xfeatures_in_use() leaves .noinstr.text section
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Tested-by: Tony Lindgren tony@atomide.com Tested-by: Ulf Hansson ulf.hansson@linaro.org Acked-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Acked-by: Frederic Weisbecker frederic@kernel.org Link: https://lore.kernel.org/r/20230112195540.494977795@infradead.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/include/asm/fpu/xcr.h | 4 ++-- arch/x86/include/asm/special_insns.h | 2 +- arch/x86/kernel/fpu/core.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h index 9656a5bc6feae..9a710c0604457 100644 --- a/arch/x86/include/asm/fpu/xcr.h +++ b/arch/x86/include/asm/fpu/xcr.h @@ -5,7 +5,7 @@ #define XCR_XFEATURE_ENABLED_MASK 0x00000000 #define XCR_XFEATURE_IN_USE_MASK 0x00000001
-static inline u64 xgetbv(u32 index) +static __always_inline u64 xgetbv(u32 index) { u32 eax, edx;
@@ -27,7 +27,7 @@ static inline void xsetbv(u32 index, u64 value) * * Callers should check X86_FEATURE_XGETBV1. */ -static inline u64 xfeatures_in_use(void) +static __always_inline u64 xfeatures_in_use(void) { return xgetbv(XCR_XFEATURE_IN_USE_MASK); } diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 35f709f619fb4..c2e322189f853 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -295,7 +295,7 @@ static inline int enqcmds(void __iomem *dst, const void *src) return 0; }
-static inline void tile_release(void) +static __always_inline void tile_release(void) { /* * Instruction opcode for TILERELEASE; supported in binutils diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 9baa89a8877d0..dccce58201b7c 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -853,12 +853,12 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr) * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ -void fpu_idle_fpregs(void) +noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); - fpregs_deactivate(¤t->thread.fpu); + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } }
From: Peter Zijlstra peterz@infradead.org
[ Upstream commit 69d4c0d3218692ffa56b0e1b9c76c50c699d7044 ]
KASAN cannot just hijack the mem*() functions, it needs to emit __asan_mem*() variants if it wants instrumentation (other sanitizers already do this).
vmlinux.o: warning: objtool: sync_regs+0x24: call to memcpy() leaves .noinstr.text section vmlinux.o: warning: objtool: vc_switch_off_ist+0xbe: call to memcpy() leaves .noinstr.text section vmlinux.o: warning: objtool: fixup_bad_iret+0x36: call to memset() leaves .noinstr.text section vmlinux.o: warning: objtool: __sev_get_ghcb+0xa0: call to memcpy() leaves .noinstr.text section vmlinux.o: warning: objtool: __sev_put_ghcb+0x35: call to memcpy() leaves .noinstr.text section
Remove the weak aliases to ensure nobody hijacks these functions and add them to the noinstr section.
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Tested-by: Tony Lindgren tony@atomide.com Tested-by: Ulf Hansson ulf.hansson@linaro.org Acked-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Acked-by: Frederic Weisbecker frederic@kernel.org Link: https://lore.kernel.org/r/20230112195542.028523143@infradead.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/lib/memcpy_64.S | 5 ++--- arch/x86/lib/memmove_64.S | 4 +++- arch/x86/lib/memset_64.S | 4 +++- mm/kasan/kasan.h | 4 ++++ mm/kasan/shadow.c | 38 ++++++++++++++++++++++++++++++++++++++ tools/objtool/check.c | 3 +++ 6 files changed, 53 insertions(+), 5 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index dd8cd8831251f..a64017602010e 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -8,7 +8,7 @@ #include <asm/alternative.h> #include <asm/export.h>
-.pushsection .noinstr.text, "ax" +.section .noinstr.text, "ax"
/* * We build a jump to memcpy_orig by default which gets NOPped out on @@ -43,7 +43,7 @@ SYM_TYPED_FUNC_START(__memcpy) SYM_FUNC_END(__memcpy) EXPORT_SYMBOL(__memcpy)
-SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy) +SYM_FUNC_ALIAS(memcpy, __memcpy) EXPORT_SYMBOL(memcpy)
/* @@ -184,4 +184,3 @@ SYM_FUNC_START_LOCAL(memcpy_orig) RET SYM_FUNC_END(memcpy_orig)
-.popsection diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 724bbf83eb5b0..02661861e5dd9 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -13,6 +13,8 @@
#undef memmove
+.section .noinstr.text, "ax" + /* * Implement memmove(). This can handle overlap between src and dst. * @@ -213,5 +215,5 @@ SYM_FUNC_START(__memmove) SYM_FUNC_END(__memmove) EXPORT_SYMBOL(__memmove)
-SYM_FUNC_ALIAS_WEAK(memmove, __memmove) +SYM_FUNC_ALIAS(memmove, __memmove) EXPORT_SYMBOL(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index fc9ffd3ff3b21..6143b1a6fa2ca 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -6,6 +6,8 @@ #include <asm/alternative.h> #include <asm/export.h>
+.section .noinstr.text, "ax" + /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is @@ -43,7 +45,7 @@ SYM_FUNC_START(__memset) SYM_FUNC_END(__memset) EXPORT_SYMBOL(__memset)
-SYM_FUNC_ALIAS_WEAK(memset, __memset) +SYM_FUNC_ALIAS(memset, __memset) EXPORT_SYMBOL(memset)
/* diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index abbcc1b0eec50..a2a367d2ac809 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -614,6 +614,10 @@ void __asan_set_shadow_f3(const void *addr, size_t size); void __asan_set_shadow_f5(const void *addr, size_t size); void __asan_set_shadow_f8(const void *addr, size_t size);
+void *__asan_memset(void *addr, int c, size_t len); +void *__asan_memmove(void *dest, const void *src, size_t len); +void *__asan_memcpy(void *dest, const void *src, size_t len); + void __hwasan_load1_noabort(unsigned long addr); void __hwasan_store1_noabort(unsigned long addr); void __hwasan_load2_noabort(unsigned long addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 0e3648b603a6f..48c405886f958 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -38,6 +38,12 @@ bool __kasan_check_write(const volatile void *p, unsigned int size) } EXPORT_SYMBOL(__kasan_check_write);
+#ifndef CONFIG_GENERIC_ENTRY +/* + * CONFIG_GENERIC_ENTRY relies on compiler emitted mem*() calls to not be + * instrumented. KASAN enabled toolchains should emit __asan_mem*() functions + * for the sites they want to instrument. + */ #undef memset void *memset(void *addr, int c, size_t len) { @@ -68,6 +74,38 @@ void *memcpy(void *dest, const void *src, size_t len)
return __memcpy(dest, src, len); } +#endif + +void *__asan_memset(void *addr, int c, size_t len) +{ + if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + return NULL; + + return __memset(addr, c, len); +} +EXPORT_SYMBOL(__asan_memset); + +#ifdef __HAVE_ARCH_MEMMOVE +void *__asan_memmove(void *dest, const void *src, size_t len) +{ + if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + return NULL; + + return __memmove(dest, src, len); +} +EXPORT_SYMBOL(__asan_memmove); +#endif + +void *__asan_memcpy(void *dest, const void *src, size_t len) +{ + if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + return NULL; + + return __memcpy(dest, src, len); +} +EXPORT_SYMBOL(__asan_memcpy);
void kasan_poison(const void *addr, size_t size, u8 value, bool init) { diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 51494c3002d91..1252c06f42b3b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -955,6 +955,9 @@ static const char *uaccess_safe_builtin[] = { "__asan_store16_noabort", "__kasan_check_read", "__kasan_check_write", + "__asan_memset", + "__asan_memmove", + "__asan_memcpy", /* KASAN in-line */ "__asan_report_load_n_noabort", "__asan_report_load1_noabort",
From: Jens Axboe axboe@kernel.dk
[ Upstream commit cb3ea4b7671b7cfbac3ee609976b790aebd0bbda ]
We don't set it on PF_KTHREAD threads as they never return to userspace, and PF_IO_WORKER threads are identical in that regard. As they keep running in the kernel until they die, skip setting the FPU flag on them.
More of a cosmetic thing that was found while debugging and issue and pondering why the FPU flag is set on these threads.
Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Ingo Molnar mingo@kernel.org Acked-by: Peter Zijlstra peterz@infradead.org Link: https://lore.kernel.org/r/560c844c-f128-555b-40c6-31baff27537f@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/kernel/fpu/context.h | 2 +- arch/x86/kernel/fpu/core.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index b2486b2cbc6e0..c2d6cd78ed0c2 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -39,7 +39,7 @@ extern void fpu_flush_thread(void); static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && - !(current->flags & PF_KTHREAD)) { + !(current->flags & (PF_KTHREAD | PF_IO_WORKER))) { save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index 958accf2ccf07..9fcfa5c4dad79 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -57,7 +57,7 @@ static inline void fpregs_restore_userregs(void) struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id();
- if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) + if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_IO_WORKER))) return;
if (!fpregs_state_valid(fpu, cpu)) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index dccce58201b7c..caf33486dc5ee 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -426,7 +426,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
this_cpu_write(in_kernel_fpu, true);
- if (!(current->flags & PF_KTHREAD) && + if (!(current->flags & (PF_KTHREAD | PF_IO_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(¤t->thread.fpu);
From: Mark Rutland mark.rutland@arm.com
[ Upstream commit 393e2ea30aec634b37004d401863428e120d5e1b ]
The PSCI suspend code is currently instrumentable, which is not safe as instrumentation (e.g. ftrace) may try to make use of RCU during idle periods when RCU is not watching.
To fix this we need to ensure that psci_suspend_finisher() and anything it calls are not instrumented. We can do this fairly simply by marking psci_suspend_finisher() and the psci*_cpu_suspend() functions as noinstr, and the underlying helper functions as __always_inline.
When CONFIG_DEBUG_VIRTUAL=y, __pa_symbol() can expand to an out-of-line instrumented function, so we must use __pa_symbol_nodebug() within psci_suspend_finisher().
The raw SMCCC invocation functions are written in assembly, and are not subject to compiler instrumentation.
Signed-off-by: Mark Rutland mark.rutland@arm.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Link: https://lore.kernel.org/r/20230126151323.349423061@infradead.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/firmware/psci/psci.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-)
diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 447ee4ea5c903..f78249fe2512a 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -108,9 +108,10 @@ bool psci_power_state_is_valid(u32 state) return !(state & ~valid_mask); }
-static unsigned long __invoke_psci_fn_hvc(unsigned long function_id, - unsigned long arg0, unsigned long arg1, - unsigned long arg2) +static __always_inline unsigned long +__invoke_psci_fn_hvc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2) { struct arm_smccc_res res;
@@ -118,9 +119,10 @@ static unsigned long __invoke_psci_fn_hvc(unsigned long function_id, return res.a0; }
-static unsigned long __invoke_psci_fn_smc(unsigned long function_id, - unsigned long arg0, unsigned long arg1, - unsigned long arg2) +static __always_inline unsigned long +__invoke_psci_fn_smc(unsigned long function_id, + unsigned long arg0, unsigned long arg1, + unsigned long arg2) { struct arm_smccc_res res;
@@ -128,7 +130,7 @@ static unsigned long __invoke_psci_fn_smc(unsigned long function_id, return res.a0; }
-static int psci_to_linux_errno(int errno) +static __always_inline int psci_to_linux_errno(int errno) { switch (errno) { case PSCI_RET_SUCCESS: @@ -169,7 +171,8 @@ int psci_set_osi_mode(bool enable) return psci_to_linux_errno(err); }
-static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) +static __always_inline int +__psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) { int err;
@@ -177,13 +180,15 @@ static int __psci_cpu_suspend(u32 fn, u32 state, unsigned long entry_point) return psci_to_linux_errno(err); }
-static int psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) +static __always_inline int +psci_0_1_cpu_suspend(u32 state, unsigned long entry_point) { return __psci_cpu_suspend(psci_0_1_function_ids.cpu_suspend, state, entry_point); }
-static int psci_0_2_cpu_suspend(u32 state, unsigned long entry_point) +static __always_inline int +psci_0_2_cpu_suspend(u32 state, unsigned long entry_point) { return __psci_cpu_suspend(PSCI_FN_NATIVE(0_2, CPU_SUSPEND), state, entry_point); @@ -450,10 +455,12 @@ late_initcall(psci_debugfs_init) #endif
#ifdef CONFIG_CPU_IDLE -static int psci_suspend_finisher(unsigned long state) +static noinstr int psci_suspend_finisher(unsigned long state) { u32 power_state = state; - phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume); + phys_addr_t pa_cpu_resume; + + pa_cpu_resume = __pa_symbol_nodebug((unsigned long)cpu_resume);
return psci_ops.cpu_suspend(power_state, pa_cpu_resume); }
From: Peter Zijlstra peterz@infradead.org
[ Upstream commit 5a5d7e9badd2cb8065db171961bd30bd3595e4b6 ]
In order to avoid WARN/BUG from generating nested or even recursive warnings, force rcu_is_watching() true during WARN/lockdep_rcu_suspicious().
Notably things like unwinding the stack can trigger rcu_dereference() warnings, which then triggers more unwinding which then triggers more warnings etc..
Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Link: https://lore.kernel.org/r/20230126151323.408156109@infradead.org Signed-off-by: Sasha Levin sashal@kernel.org --- include/linux/context_tracking.h | 27 +++++++++++++++++++++++++++ kernel/locking/lockdep.c | 3 +++ kernel/panic.c | 5 +++++ lib/bug.c | 15 ++++++++++++++- 4 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index dcef4a9e4d63e..d4afa8508a806 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -130,9 +130,36 @@ static __always_inline unsigned long ct_state_inc(int incby) return arch_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state)); }
+static __always_inline bool warn_rcu_enter(void) +{ + bool ret = false; + + /* + * Horrible hack to shut up recursive RCU isn't watching fail since + * lots of the actual reporting also relies on RCU. + */ + preempt_disable_notrace(); + if (rcu_dynticks_curr_cpu_in_eqs()) { + ret = true; + ct_state_inc(RCU_DYNTICKS_IDX); + } + + return ret; +} + +static __always_inline void warn_rcu_exit(bool rcu) +{ + if (rcu) + ct_state_inc(RCU_DYNTICKS_IDX); + preempt_enable_notrace(); +} + #else static inline void ct_idle_enter(void) { } static inline void ct_idle_exit(void) { } + +static __always_inline bool warn_rcu_enter(void) { return false; } +static __always_inline void warn_rcu_exit(bool rcu) { } #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */
#endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e3375bc40dadc..50d4863974e7a 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -55,6 +55,7 @@ #include <linux/rcupdate.h> #include <linux/kprobes.h> #include <linux/lockdep.h> +#include <linux/context_tracking.h>
#include <asm/sections.h>
@@ -6555,6 +6556,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; int dl = READ_ONCE(debug_locks); + bool rcu = warn_rcu_enter();
/* Note: the following can be executed concurrently, so be careful. */ pr_warn("\n"); @@ -6595,5 +6597,6 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) lockdep_print_held_locks(curr); pr_warn("\nstack backtrace:\n"); dump_stack(); + warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/panic.c b/kernel/panic.c index 7834c9854e026..5864f356ee576 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,6 +33,7 @@ #include <linux/ratelimit.h> #include <linux/debugfs.h> #include <linux/sysfs.h> +#include <linux/context_tracking.h> #include <trace/events/error_report.h> #include <asm/sections.h>
@@ -678,6 +679,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, void warn_slowpath_fmt(const char *file, int line, unsigned taint, const char *fmt, ...) { + bool rcu = warn_rcu_enter(); struct warn_args args;
pr_warn(CUT_HERE); @@ -692,11 +694,13 @@ void warn_slowpath_fmt(const char *file, int line, unsigned taint, va_start(args.args, fmt); __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(warn_slowpath_fmt); #else void __warn_printk(const char *fmt, ...) { + bool rcu = warn_rcu_enter(); va_list args;
pr_warn(CUT_HERE); @@ -704,6 +708,7 @@ void __warn_printk(const char *fmt, ...) va_start(args, fmt); vprintk(fmt, args); va_end(args); + warn_rcu_exit(rcu); } EXPORT_SYMBOL(__warn_printk); #endif diff --git a/lib/bug.c b/lib/bug.c index c223a2575b721..e0ff219899902 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -47,6 +47,7 @@ #include <linux/sched.h> #include <linux/rculist.h> #include <linux/ftrace.h> +#include <linux/context_tracking.h>
extern struct bug_entry __start___bug_table[], __stop___bug_table[];
@@ -153,7 +154,7 @@ struct bug_entry *find_bug(unsigned long bugaddr) return module_find_bug(bugaddr); }
-enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) +static enum bug_trap_type __report_bug(unsigned long bugaddr, struct pt_regs *regs) { struct bug_entry *bug; const char *file; @@ -209,6 +210,18 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) return BUG_TRAP_TYPE_BUG; }
+enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) +{ + enum bug_trap_type ret; + bool rcu = false; + + rcu = warn_rcu_enter(); + ret = __report_bug(bugaddr, regs); + warn_rcu_exit(rcu); + + return ret; +} + static void clear_once_table(struct bug_entry *start, struct bug_entry *end) { struct bug_entry *bug;
From: Kan Liang kan.liang@linux.intel.com
[ Upstream commit c828441f21ddc819a28b5723a72e3c840e9de1c6 ]
The uncore subsystem for Meteor Lake is similar to the previous Alder Lake. The main difference is that MTL provides PMU support for different tiles, while ADL only provides PMU support for the whole package. On ADL, there are CBOX, ARB, and clockbox uncore PMON units. On MTL, they are split into CBOX/HAC_CBOX, ARB/HAC_ARB, and cncu/sncu which provides a fixed counter for clockticks. Also, new MSR addresses are introduced on MTL.
The IMC uncore PMON is the same as Alder Lake. Add new PCIIDs of IMC for Meteor Lake.
Signed-off-by: Kan Liang kan.liang@linux.intel.com Signed-off-by: Ingo Molnar mingo@kernel.org Link: https://lore.kernel.org/r/20230210190238.1726237-1-kan.liang@linux.intel.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/events/intel/uncore.c | 7 ++ arch/x86/events/intel/uncore.h | 1 + arch/x86/events/intel/uncore_snb.c | 161 +++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+)
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 459b1aafd4d4a..27b34f5b87600 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1765,6 +1765,11 @@ static const struct intel_uncore_init_fun adl_uncore_init __initconst = { .mmio_init = adl_uncore_mmio_init, };
+static const struct intel_uncore_init_fun mtl_uncore_init __initconst = { + .cpu_init = mtl_uncore_cpu_init, + .mmio_init = adl_uncore_mmio_init, +}; + static const struct intel_uncore_init_fun icx_uncore_init __initconst = { .cpu_init = icx_uncore_cpu_init, .pci_init = icx_uncore_pci_init, @@ -1832,6 +1837,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &mtl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &mtl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index b363fddc2a89e..b74e352910f45 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -587,6 +587,7 @@ void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); void tgl_uncore_cpu_init(void); void adl_uncore_cpu_init(void); +void mtl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); void adl_uncore_mmio_init(void); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 1f4869227efb9..7fd4334e12a17 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -109,6 +109,19 @@ #define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728 #define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729 #define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A +#define PCI_DEVICE_ID_INTEL_MTL_1_IMC 0x7d00 +#define PCI_DEVICE_ID_INTEL_MTL_2_IMC 0x7d01 +#define PCI_DEVICE_ID_INTEL_MTL_3_IMC 0x7d02 +#define PCI_DEVICE_ID_INTEL_MTL_4_IMC 0x7d05 +#define PCI_DEVICE_ID_INTEL_MTL_5_IMC 0x7d10 +#define PCI_DEVICE_ID_INTEL_MTL_6_IMC 0x7d14 +#define PCI_DEVICE_ID_INTEL_MTL_7_IMC 0x7d15 +#define PCI_DEVICE_ID_INTEL_MTL_8_IMC 0x7d16 +#define PCI_DEVICE_ID_INTEL_MTL_9_IMC 0x7d21 +#define PCI_DEVICE_ID_INTEL_MTL_10_IMC 0x7d22 +#define PCI_DEVICE_ID_INTEL_MTL_11_IMC 0x7d23 +#define PCI_DEVICE_ID_INTEL_MTL_12_IMC 0x7d24 +#define PCI_DEVICE_ID_INTEL_MTL_13_IMC 0x7d28
#define IMC_UNCORE_DEV(a) \ @@ -205,6 +218,32 @@ #define ADL_UNC_ARB_PERFEVTSEL0 0x2FD0 #define ADL_UNC_ARB_MSR_OFFSET 0x8
+/* MTL Cbo register */ +#define MTL_UNC_CBO_0_PER_CTR0 0x2448 +#define MTL_UNC_CBO_0_PERFEVTSEL0 0x2442 + +/* MTL HAC_ARB register */ +#define MTL_UNC_HAC_ARB_CTR 0x2018 +#define MTL_UNC_HAC_ARB_CTRL 0x2012 + +/* MTL ARB register */ +#define MTL_UNC_ARB_CTR 0x2418 +#define MTL_UNC_ARB_CTRL 0x2412 + +/* MTL cNCU register */ +#define MTL_UNC_CNCU_FIXED_CTR 0x2408 +#define MTL_UNC_CNCU_FIXED_CTRL 0x2402 +#define MTL_UNC_CNCU_BOX_CTL 0x240e + +/* MTL sNCU register */ +#define MTL_UNC_SNCU_FIXED_CTR 0x2008 +#define MTL_UNC_SNCU_FIXED_CTRL 0x2002 +#define MTL_UNC_SNCU_BOX_CTL 0x200e + +/* MTL HAC_CBO register */ +#define MTL_UNC_HBO_CTR 0x2048 +#define MTL_UNC_HBO_CTRL 0x2042 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11"); @@ -598,6 +637,115 @@ void adl_uncore_cpu_init(void) uncore_msr_uncores = adl_msr_uncores; }
+static struct intel_uncore_type mtl_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .perf_ctr_bits = 48, + .perf_ctr = MTL_UNC_CBO_0_PER_CTR0, + .event_ctl = MTL_UNC_CBO_0_PERFEVTSEL0, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &icl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + +static struct intel_uncore_type mtl_uncore_hac_arb = { + .name = "hac_arb", + .num_counters = 2, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = MTL_UNC_HAC_ARB_CTR, + .event_ctl = MTL_UNC_HAC_ARB_CTRL, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &icl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + +static struct intel_uncore_type mtl_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = MTL_UNC_ARB_CTR, + .event_ctl = MTL_UNC_ARB_CTRL, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &icl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + +static struct intel_uncore_type mtl_uncore_hac_cbox = { + .name = "hac_cbox", + .num_counters = 2, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = MTL_UNC_HBO_CTR, + .event_ctl = MTL_UNC_HBO_CTRL, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &icl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + +static void mtl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), SNB_UNC_GLOBAL_CTL_EN); +} + +static struct intel_uncore_ops mtl_uncore_msr_ops = { + .init_box = mtl_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type mtl_uncore_cncu = { + .name = "cncu", + .num_counters = 1, + .num_boxes = 1, + .box_ctl = MTL_UNC_CNCU_BOX_CTL, + .fixed_ctr_bits = 48, + .fixed_ctr = MTL_UNC_CNCU_FIXED_CTR, + .fixed_ctl = MTL_UNC_CNCU_FIXED_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_CTL_EV_SEL_MASK, + .format_group = &icl_uncore_clock_format_group, + .ops = &mtl_uncore_msr_ops, + .event_descs = icl_uncore_events, +}; + +static struct intel_uncore_type mtl_uncore_sncu = { + .name = "sncu", + .num_counters = 1, + .num_boxes = 1, + .box_ctl = MTL_UNC_SNCU_BOX_CTL, + .fixed_ctr_bits = 48, + .fixed_ctr = MTL_UNC_SNCU_FIXED_CTR, + .fixed_ctl = MTL_UNC_SNCU_FIXED_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_CTL_EV_SEL_MASK, + .format_group = &icl_uncore_clock_format_group, + .ops = &mtl_uncore_msr_ops, + .event_descs = icl_uncore_events, +}; + +static struct intel_uncore_type *mtl_msr_uncores[] = { + &mtl_uncore_cbox, + &mtl_uncore_hac_arb, + &mtl_uncore_arb, + &mtl_uncore_hac_cbox, + &mtl_uncore_cncu, + &mtl_uncore_sncu, + NULL +}; + +void mtl_uncore_cpu_init(void) +{ + mtl_uncore_cbox.num_boxes = icl_get_cbox_num(); + uncore_msr_uncores = mtl_msr_uncores; +} + enum { SNB_PCI_UNCORE_IMC, }; @@ -1264,6 +1412,19 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = { IMC_UNCORE_DEV(RPL_23), IMC_UNCORE_DEV(RPL_24), IMC_UNCORE_DEV(RPL_25), + IMC_UNCORE_DEV(MTL_1), + IMC_UNCORE_DEV(MTL_2), + IMC_UNCORE_DEV(MTL_3), + IMC_UNCORE_DEV(MTL_4), + IMC_UNCORE_DEV(MTL_5), + IMC_UNCORE_DEV(MTL_6), + IMC_UNCORE_DEV(MTL_7), + IMC_UNCORE_DEV(MTL_8), + IMC_UNCORE_DEV(MTL_9), + IMC_UNCORE_DEV(MTL_10), + IMC_UNCORE_DEV(MTL_11), + IMC_UNCORE_DEV(MTL_12), + IMC_UNCORE_DEV(MTL_13), { /* end: all zeroes */ } };
linux-stable-mirror@lists.linaro.org