- Linux-stable-mirror - lists.linaro.org

[PATCH v2 5/6] PCI: hv: Add a per-bus mutex state_lock

by Dexuan Cui

In the case of fast device addition/removal, it's possible that hv_eject_device_work() can start to run before create_root_hv_pci_bus() starts to run; as a result, the pci_get_domain_bus_and_slot() in hv_eject_device_work() can return a 'pdev' of NULL, and hv_eject_device_work() can remove the 'hpdev', and immediately send a message PCI_EJECTION_COMPLETE to the host, and the host immediately unassigns the PCI device from the guest; meanwhile, create_root_hv_pci_bus() and the PCI device driver can be probing the dead PCI device and reporting timeout errors. Fix the issue by adding a per-bus mutex 'state_lock' and grabbing the mutex before powering on the PCI bus in hv_pci_enter_d0(): when hv_eject_device_work() starts to run, it's able to find the 'pdev' and call pci_stop_and_remove_bus_device(pdev): if the PCI device driver has loaded, the PCI device driver's probe() function is already called in create_root_hv_pci_bus() -> pci_bus_add_devices(), and now hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to call the PCI device driver's remove() function and remove the device reliably; if the PCI device driver hasn't loaded yet, the function call hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to remove the PCI device reliably and the PCI device driver's probe() function won't be called; if the PCI device driver's probe() is already running (e.g., systemd-udev is loading the PCI device driver), it must be holding the per-device lock, and after the probe() finishes and releases the lock, hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to proceed to remove the device reliably. Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Dexuan Cui <decui(a)microsoft.com> Cc: stable(a)vger.kernel.org --- v2: Removed the "debug code". 
Fixed the "goto out" in hv_pci_resume() [Michael Kelley] Added Cc:stable drivers/pci/controller/pci-hyperv.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 48feab095a144..3ae2f99dea8c2 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -489,7 +489,10 @@ struct hv_pcibus_device { struct fwnode_handle *fwnode; /* Protocol version negotiated with the host */ enum pci_protocol_version_t protocol_version; + + struct mutex state_lock; enum hv_pcibus_state state; + struct hv_device *hdev; resource_size_t low_mmio_space; resource_size_t high_mmio_space; @@ -2512,6 +2515,8 @@ static void pci_devices_present_work(struct work_struct *work) if (!dr) return; + mutex_lock(&hbus->state_lock); + /* First, mark all existing children as reported missing. */ spin_lock_irqsave(&hbus->device_list_lock, flags); list_for_each_entry(hpdev, &hbus->children, list_entry) { @@ -2593,6 +2598,8 @@ static void pci_devices_present_work(struct work_struct *work) break; } + mutex_unlock(&hbus->state_lock); + kfree(dr); } @@ -2741,6 +2748,8 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; + mutex_lock(&hbus->state_lock); + /* * Ejection can come before or after the PCI bus has been set up, so * attempt to find it and tear down the bus state, if it exists. This @@ -2777,6 +2786,8 @@ static void hv_eject_device_work(struct work_struct *work) put_pcichild(hpdev); put_pcichild(hpdev); /* hpdev has been freed. Do not use it any more. 
*/ + + mutex_unlock(&hbus->state_lock); } /** @@ -3562,6 +3573,7 @@ static int hv_pci_probe(struct hv_device *hdev, return -ENOMEM; hbus->bridge = bridge; + mutex_init(&hbus->state_lock); hbus->state = hv_pcibus_init; hbus->wslot_res_allocated = -1; @@ -3670,9 +3682,11 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_irq_domain; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto free_irq_domain; + goto release_state_lock; ret = hv_pci_allocate_bridge_windows(hbus); if (ret) @@ -3690,12 +3704,15 @@ static int hv_pci_probe(struct hv_device *hdev, if (ret) goto free_windows; + mutex_unlock(&hbus->state_lock); return 0; free_windows: hv_pci_free_bridge_windows(hbus); exit_d0: (void) hv_pci_bus_exit(hdev, true); +release_state_lock: + mutex_unlock(&hbus->state_lock); free_irq_domain: irq_domain_remove(hbus->irq_domain); free_fwnode: @@ -3945,20 +3962,26 @@ static int hv_pci_resume(struct hv_device *hdev) if (ret) goto out; + mutex_lock(&hbus->state_lock); + ret = hv_pci_enter_d0(hdev); if (ret) - goto out; + goto release_state_lock; ret = hv_send_resources_allocated(hdev); if (ret) - goto out; + goto release_state_lock; prepopulate_bars(hbus); hv_pci_restore_msi_state(hbus); hbus->state = hv_pcibus_installed; + mutex_unlock(&hbus->state_lock); return 0; + +release_state_lock: + mutex_unlock(&hbus->state_lock); out: vmbus_close(hdev->channel); return ret; -- 2.25.1

2 years, 7 months

2
1
0 0

[PATCH v2 3/6] PCI: hv: Remove the useless hv_pcichild_state from struct hv_pci_dev

by Dexuan Cui

The hpdev->state is never really useful. The only use in hv_pci_eject_device() and hv_eject_device_work() is not really necessary. Signed-off-by: Dexuan Cui <decui(a)microsoft.com> Cc: stable(a)vger.kernel.org --- drivers/pci/controller/pci-hyperv.c | 12 ------------ 1 file changed, 12 deletions(-) v2: No change to the patch body. Added Cc:stable diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 1b11cf7391933..46df6d093d683 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -553,19 +553,10 @@ struct hv_dr_state { struct hv_pcidev_description func[]; }; -enum hv_pcichild_state { - hv_pcichild_init = 0, - hv_pcichild_requirements, - hv_pcichild_resourced, - hv_pcichild_ejecting, - hv_pcichild_maximum -}; - struct hv_pci_dev { /* List protected by pci_rescan_remove_lock */ struct list_head list_entry; refcount_t refs; - enum hv_pcichild_state state; struct pci_slot *pci_slot; struct hv_pcidev_description desc; bool reported_missing; @@ -2750,8 +2741,6 @@ static void hv_eject_device_work(struct work_struct *work) hpdev = container_of(work, struct hv_pci_dev, wrk); hbus = hpdev->hbus; - WARN_ON(hpdev->state != hv_pcichild_ejecting); - /* * Ejection can come before or after the PCI bus has been set up, so * attempt to find it and tear down the bus state, if it exists. This @@ -2808,7 +2797,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev) return; } - hpdev->state = hv_pcichild_ejecting; get_pcichild(hpdev); INIT_WORK(&hpdev->wrk, hv_eject_device_work); queue_work(hbus->wq, &hpdev->wrk); -- 2.25.1

2 years, 7 months

2
1
0 0

[PATCH v2 2/6] PCI: hv: Fix a race condition in hv_irq_unmask() that can cause panic

by Dexuan Cui

When the host tries to remove a PCI device, the host first sends a PCI_EJECT message to the guest, and the guest is supposed to gracefully remove the PCI device and send a PCI_EJECTION_COMPLETE message to the host; the host then sends a VMBus message CHANNELMSG_RESCIND_CHANNELOFFER to the guest (when the guest receives this message, the device is already unassigned from the guest) and the guest can do some final cleanup work; if the guest fails to respond to the PCI_EJECT message within one minute, the host sends the VMBus message CHANNELMSG_RESCIND_CHANNELOFFER and removes the PCI device forcibly. In the case of fast device addition/removal, it's possible that the PCI device driver is still configuring MSI-X interrupts when the guest receives the PCI_EJECT message; the channel callback calls hv_pci_eject_device(), which sets hpdev->state to hv_pcichild_ejecting, and schedules a work hv_eject_device_work(); if the PCI device driver is calling pci_alloc_irq_vectors() -> ... -> hv_compose_msi_msg(), we can break the while loop in hv_compose_msi_msg() due to the updated hpdev->state, and leave data->chip_data with its default value of NULL; later, when the PCI device driver calls request_irq() -> ... -> hv_irq_unmask(), the guest crashes in hv_arch_irq_unmask() due to data->chip_data being NULL. Fix the issue by not testing hpdev->state in the while loop: when the guest receives PCI_EJECT, the device is still assigned to the guest, and the guest has one minute to finish the device removal gracefully. We don't really need to (and we should not) test hpdev->state in the loop. Fixes: de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()") Signed-off-by: Dexuan Cui <decui(a)microsoft.com> Cc: stable(a)vger.kernel.org --- v2: Removed the "debug code". No change to the patch body. 
Added Cc:stable drivers/pci/controller/pci-hyperv.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index b82c7cde19e66..1b11cf7391933 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -643,6 +643,11 @@ static void hv_arch_irq_unmask(struct irq_data *data) pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); int_desc = data->chip_data; + if (!int_desc) { + dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n", + __func__, data->irq); + return; + } spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); @@ -1911,12 +1916,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) hv_pci_onchannelcallback(hbus); spin_unlock_irqrestore(&channel->sched_lock, flags); - if (hpdev->state == hv_pcichild_ejecting) { - dev_err_once(&hbus->hdev->device, - "the device is being ejected\n"); - goto enable_tasklet; - } - udelay(100); } -- 2.25.1

2 years, 7 months

2
1
0 0

[PATCH v2 1/6] PCI: hv: Fix a race condition bug in hv_pci_query_relations()

by Dexuan Cui

Fix the longstanding race between hv_pci_query_relations() and survey_child_resources() by flushing the workqueue before we exit from hv_pci_query_relations(). Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Dexuan Cui <decui(a)microsoft.com> Cc: stable(a)vger.kernel.org --- v2: Removed the "debug code". No change to the patch body. Added Cc:stable drivers/pci/controller/pci-hyperv.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index f33370b756283..b82c7cde19e66 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3308,6 +3308,19 @@ static int hv_pci_query_relations(struct hv_device *hdev) if (!ret) ret = wait_for_response(hdev, &comp); + /* + * In the case of fast device addition/removal, it's possible that + * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we + * already got a PCI_BUS_RELATIONS* message from the host and the + * channel callback already scheduled a work to hbus->wq, which can be + * running survey_child_resources() -> complete(&hbus->survey_event), + * even after hv_pci_query_relations() exits and the stack variable + * 'comp' is no longer valid. This can cause a strange hang issue + * or sometimes a page fault. Flush hbus->wq before we exit from + * hv_pci_query_relations() to avoid the issues. + */ + flush_workqueue(hbus->wq); + return ret; } -- 2.25.1

2 years, 7 months

2
1
0 0

[PATCH net] net: phy: nxp-c45-tja11xx: disable port and global interrupts

by Radu Pirea (OSS)

Disabling only the link event irq is not enough to disable the interrupts. PTP will still be able to generate interrupts. The interrupts are organised in a tree on the C45 TJA11XX PHYs. To completely disable the interrupts, they are disabled from the top of the interrupt tree. Fixes: 514def5dd339 ("phy: nxp-c45-tja11xx: add timestamping support") CC: stable(a)vger.kernel.org # 5.15+ Signed-off-by: Radu Pirea (OSS) <radu-nicolae.pirea(a)oss.nxp.com> --- drivers/net/phy/nxp-c45-tja11xx.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 029875a59ff8..ce718a5865a4 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -31,6 +31,10 @@ #define DEVICE_CONTROL_CONFIG_GLOBAL_EN BIT(14) #define DEVICE_CONTROL_CONFIG_ALL_EN BIT(13) +#define VEND1_PORT_IRQ_ENABLES 0x0072 +#define PORT1_IRQ BIT(1) +#define GLOBAL_IRQ BIT(0) + #define VEND1_PHY_IRQ_ACK 0x80A0 #define VEND1_PHY_IRQ_EN 0x80A1 #define VEND1_PHY_IRQ_STATUS 0x80A2 @@ -235,7 +239,7 @@ struct nxp_c45_phy_stats { u16 mask; }; -static bool nxp_c45_poll_txts(struct phy_device *phydev) +static bool nxp_c45_poll(struct phy_device *phydev) { return phydev->irq <= 0; } @@ -448,7 +452,7 @@ static void nxp_c45_process_txts(struct nxp_c45_phy *priv, static long nxp_c45_do_aux_work(struct ptp_clock_info *ptp) { struct nxp_c45_phy *priv = container_of(ptp, struct nxp_c45_phy, caps); - bool poll_txts = nxp_c45_poll_txts(priv->phydev); + bool poll_txts = nxp_c45_poll(priv->phydev); struct skb_shared_hwtstamps *shhwtstamps_rx; struct ptp_clock_event event; struct nxp_c45_hwts hwts; @@ -699,7 +703,7 @@ static void nxp_c45_txtstamp(struct mii_timestamper *mii_ts, NXP_C45_SKB_CB(skb)->header = ptp_parse_header(skb, type); skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; skb_queue_tail(&priv->tx_queue, skb); - if (nxp_c45_poll_txts(priv->phydev)) + if (nxp_c45_poll(priv->phydev)) 
ptp_schedule_worker(priv->ptp_clock, 0); break; case HWTSTAMP_TX_OFF: @@ -772,7 +776,7 @@ static int nxp_c45_hwtstamp(struct mii_timestamper *mii_ts, PORT_PTP_CONTROL_BYPASS); } - if (nxp_c45_poll_txts(priv->phydev)) + if (nxp_c45_poll(priv->phydev)) goto nxp_c45_no_ptp_irq; if (priv->hwts_tx) @@ -892,10 +896,12 @@ static int nxp_c45_config_intr(struct phy_device *phydev) { if (phydev->interrupts == PHY_INTERRUPT_ENABLED) return phy_set_bits_mmd(phydev, MDIO_MMD_VEND1, - VEND1_PHY_IRQ_EN, PHY_IRQ_LINK_EVENT); + VEND1_PORT_IRQ_ENABLES, + PORT1_IRQ | GLOBAL_IRQ); else return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND1, - VEND1_PHY_IRQ_EN, PHY_IRQ_LINK_EVENT); + VEND1_PORT_IRQ_ENABLES, + PORT1_IRQ | GLOBAL_IRQ); } static irqreturn_t nxp_c45_handle_interrupt(struct phy_device *phydev) @@ -1290,6 +1296,10 @@ static int nxp_c45_config_init(struct phy_device *phydev) phy_set_bits_mmd(phydev, MDIO_MMD_VEND1, VEND1_PORT_FUNC_ENABLES, PTP_ENABLE); + if (!nxp_c45_poll(phydev)) + phy_set_bits_mmd(phydev, MDIO_MMD_VEND1, + VEND1_PHY_IRQ_EN, PHY_IRQ_LINK_EVENT); + return nxp_c45_start_op(phydev); } -- 2.34.1

2 years, 7 months

2
2
0 0

[PATCH v2] purgatory: fix disabling debug info

by Alyssa Ross

Since 32ef9e5054ec, -Wa,-gdwarf-2 is no longer used in KBUILD_AFLAGS. Instead, it includes -g, the appropriate -gdwarf-* flag, and also the -Wa versions of both of those if building with Clang and GNU as. As a result, debug info was being generated for the purgatory objects, even though the intention was that it not be. Fixes: 32ef9e5054ec ("Makefile.debug: re-enable debug info for .S files") Signed-off-by: Alyssa Ross <hi(a)alyssa.is> Cc: stable(a)vger.kernel.org --- Difference from v2: replace each AFLAGS_REMOVE_* assignment with a single asflags-remove-y line, and use foreach to add the -Wa versions, as suggested by Masahiro Yamada. arch/riscv/purgatory/Makefile | 7 +------ arch/x86/purgatory/Makefile | 3 +-- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index d16bf715a586..5730797a6b40 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -84,12 +84,7 @@ CFLAGS_string.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_ctype.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_ctype.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_entry.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_memcpy.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_memset.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strcmp.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strlen.o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_strncmp.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 17f09dc26381..82fec66d46d2 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -69,8 +69,7 @@ CFLAGS_sha256.o += $(PURGATORY_CFLAGS) CFLAGS_REMOVE_string.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_string.o += $(PURGATORY_CFLAGS) -AFLAGS_REMOVE_setup-x86_$(BITS).o += -Wa,-gdwarf-2 -AFLAGS_REMOVE_entry64.o += -Wa,-gdwarf-2 +asflags-remove-y += $(foreach x, -g -gdwarf-4 -gdwarf-5, $(x) -Wa,$(x)) 
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE $(call if_changed,ld) base-commit: da8e7da11e4ba758caf4c149cc8d8cd555aefe5f -- 2.37.1

2 years, 7 months

3
3
0 0

6.1.22: Resume from hibernate fails; bisected

by Rainer Fiebig

Hi! Since kernel 6.1.22 starting a resume from hibernate by hitting a key on the keyboard fails. However, if the PC was switched off and on again (or reset), the resume is OK. The APU is a Ryzen 5600G. Bisecting between 6.1.21/22 turned up this: Author: Tim Huang <tim.huang(a)amd.com> Date: Thu Mar 9 16:27:51 2023 +0800 drm/amdgpu: skip ASIC reset for APUs when go to S4 commit b589626674de94d977e81c99bf7905872b991197 upstream. For GC IP v11.0.4/11, PSP TMR need to be reserved for ASIC mode2 reset. But for S4, when psp suspend, it will destroy the TMR that fails the ASIC reset. [...] Reverting the commit solves the problem. Thanks. Rainer Fiebig -- The truth always turns out to be simpler than you thought. Richard Feynman

2 years, 7 months

3
5
0 0

Re: 6.1.22: Resume from hibernate fails; bisected

by Rainer Fiebig

Am 07.04.23 um 05:40 schrieb Huang, Tim: > [AMD Official Use Only - General] > > On Thu, Apr 06, 2023 at 05:39:07PM +0200, Rainer Fiebig wrote: >> Am 06.04.23 um 15:30 schrieb Linux regression tracking (Thorsten Leemhuis): >>> [CCing the regression list, as it should be in the loop for regressions: >>> https://docs.kernel.org/admin-guide/reporting-regressions.html] >>> >>> On 06.04.23 14:06, Rainer Fiebig wrote: >>>> Hi! Since kernel 6.1.22 starting a resume from hibernate by hitting a >>>> key on the keyboard fails. However, if the PC was switched off and on >>>> again (or reset), the resume is OK. The APU is a Ryzen 5600G. >>>> >>>> Bisecting between 6.1.21/22 turned up this: >>>> >>>> >>>> Author: Tim Huang <tim.huang(a)amd.com> >>>> Date: Thu Mar 9 16:27:51 2023 +0800 >>>> >>>> drm/amdgpu: skip ASIC reset for APUs when go to S4 >>>> >>>> commit b589626674de94d977e81c99bf7905872b991197 upstream. >>>> >>>> For GC IP v11.0.4/11, PSP TMR need to be reserved >>>> for ASIC mode2 reset. But for S4, when psp suspend, >>>> it will destroy the TMR that fails the ASIC reset. >>>> [...] >>>> >>>> >>>> Reverting the commit solves the problem. >>>> Thanks. >>> >>> Please try 6.1.23 and report back, because from the thread >>> https://lore.kernel.org/all/20230330160740.1dbff94b@schienar/ >>> it sounds a lot like "drm/amdgpu: allow more APUs to do mode2 reset when >>> go to S4" might be fixing this, which went into 6.1.23. >> Yes, 6.1.23 seems OK so far. >> > > > The patch " drm/amdgpu: allow more APUs to do mode2 reset when go to S4" is to fix this hibernate regression issue. > > Sorry to have troubled you. No problem, please don't take it personally. It wasn't a big deal and I was just a bit grumpy yesterday. Thanks for the info and have a good day! Rainer

2 years, 7 months

1
0
0 0

[PATCH] LoongArch: module: set section addresses to 0x0

by Huacai Chen

These got*, plt* and .text.ftrace_trampoline sections specified for LoongArch have non-zero addresses. Non-zero section addresses in a relocatable ELF would confuse GDB when it tries to compute the section offsets and it ends up printing wrong symbol addresses. Therefore, set them to zero, which mirrors the change in commit 5d8591bc0fbaeb6ded ("arm64 module: set plt* section addresses to 0x0"). Cc: stable(a)vger.kernel.org Signed-off-by: Chong Qiao <qiaochong(a)loongson.cn> Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn> --- arch/loongarch/include/asm/module.lds.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/module.lds.h b/arch/loongarch/include/asm/module.lds.h index 438f09d4ccf4..88554f92e010 100644 --- a/arch/loongarch/include/asm/module.lds.h +++ b/arch/loongarch/include/asm/module.lds.h @@ -2,8 +2,8 @@ /* Copyright (C) 2020-2022 Loongson Technology Corporation Limited */ SECTIONS { . = ALIGN(4); - .got : { BYTE(0) } - .plt : { BYTE(0) } - .plt.idx : { BYTE(0) } - .ftrace_trampoline : { BYTE(0) } + .got 0 : { BYTE(0) } + .plt 0 : { BYTE(0) } + .plt.idx 0 : { BYTE(0) } + .ftrace_trampoline 0 : { BYTE(0) } } -- 2.39.1

2 years, 7 months

2
1
0 0

[PATCH] md/raid10: fix deadlock when handle read error and running data-check at same time

by linminggui

When running a data-check and encountering a normal IO error, raid10d handles the error while one resync IO is added into conf->retry_list, waiting for raid10d to handle it, so the barrier will not drop to zero and the normal IO (read error) will get stuck in wait_barrier in raid10_read_request. After this, the resync thread will get stuck in raise_barrier, and other processes will get stuck in wait_barrier. Ignore the barrier for read error retry in raid10_read_request to avoid the deadlock. for kernel linux-4.19.y processA md0_raid10 md0_resync processB ------------------------------------------------------------------------- | | | | read io error | | | | handle_read_error raise_barrier | | | (nr_pending=1,barrier=1) | | | wait_barrier | | (nr_waiting=1,barrier=1) allow_barrier | | (nr_pending=0) | | | | | conf->retry_list | | | | wait_barrier (nr_waiting=2,barrier=1) [ 1452.065519] INFO: task md0_raid10:381 blocked for more than 120 seconds. [ 1452.065852] Tainted: G OE K 4.19.280 #2 [ 1452.066018] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1452.066189] md0_raid10 D 0 381 2 0x80000000 [ 1452.066191] Call Trace: [ 1452.066197] __schedule+0x3f8/0x8b0 [ 1452.066199] schedule+0x36/0x80 [ 1452.066201] wait_barrier+0x150/0x1b0 [ 1452.066203] ? wait_woken+0x80/0x80 [ 1452.066205] raid10_read_request+0xa8/0x510 [ 1452.066206] handle_read_error+0xa9/0x220 [ 1452.066207] ? pick_next_task_fair+0x15d/0x610 [ 1452.066208] raid10d+0xa01/0x1510 [ 1452.066210] ? schedule+0x36/0x80 [ 1452.066211] md_thread+0x133/0x180 [ 1452.066212] ? md_thread+0x133/0x180 [ 1452.066213] ? 
wait_woken+0x80/0x80 [ 1452.066214] kthread+0x105/0x140 Signed-off-by: linminggui <linminggui1(a)bigo.sg> --- drivers/md/raid10.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 9f9cd2f..9f00400 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1137,6 +1137,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, int slot = r10_bio->read_slot; struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; + bool error_retry = false; if (slot >= 0 && r10_bio->devs[slot].rdev) { /* @@ -1153,6 +1154,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, */ gfp = GFP_NOIO | __GFP_HIGH; + error_retry = true; + atomic_inc(&conf->nr_pending); + rcu_read_lock(); disk = r10_bio->devs[slot].devnum; err_rdev = rcu_dereference(conf->mirrors[disk].rdev); @@ -1169,8 +1173,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. + * Ignore barrier if this is an error retry. 
*/ - wait_barrier(conf); + if (!error_retry) + wait_barrier(conf); sectors = r10_bio->sectors; while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && @@ -1181,12 +1187,14 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, * pass */ raid10_log(conf->mddev, "wait reshape"); - allow_barrier(conf); + if (!error_retry) + allow_barrier(conf); wait_event(conf->wait_barrier, conf->reshape_progress <= bio->bi_iter.bi_sector || conf->reshape_progress >= bio->bi_iter.bi_sector + sectors); - wait_barrier(conf); + if (!error_retry) + wait_barrier(conf); } rdev = read_balance(conf, r10_bio, &max_sectors); @@ -1208,9 +1216,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); bio_chain(split, bio); - allow_barrier(conf); + if (!error_retry) + allow_barrier(conf); generic_make_request(bio); - wait_barrier(conf); + if (!error_retry) + wait_barrier(conf); bio = split; r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; -- 2.7.4

2 years, 7 months

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror