From: Stefano Brivio sbrivio@redhat.com
[ Upstream commit 8cc4ccf58379935f3ad456cc34e61c4e4c921d0e ]
There doesn't seem to be any reason to restrict MAC address matching to source MAC addresses in set types bitmap:ipmac, hash:ipmac and hash:mac. With this patch, and this setup:
ip netns add A ip link add veth1 type veth peer name veth2 netns A ip addr add 192.0.2.1/24 dev veth1 ip -net A addr add 192.0.2.2/24 dev veth2 ip link set veth1 up ip -net A link set veth2 up
ip netns exec A ipset create test hash:mac dst=$(ip netns exec A cat /sys/class/net/veth2/address) ip netns exec A ipset add test ${dst} ip netns exec A iptables -P INPUT DROP ip netns exec A iptables -I INPUT -m set --match-set test dst -j ACCEPT
ipset will match packets based on destination MAC address:
# ping -c1 192.0.2.2 >/dev/null # echo $? 0
Reported-by: Yi Chen yiche@redhat.com Signed-off-by: Stefano Brivio sbrivio@redhat.com Signed-off-by: Jozsef Kadlecsik kadlec@blackhole.kfki.hu Signed-off-by: Sasha Levin sashal@kernel.org --- net/netfilter/ipset/ip_set_bitmap_ipmac.c | 10 +++++----- net/netfilter/ipset/ip_set_hash_ipmac.c | 16 ++++++++++------ net/netfilter/ipset/ip_set_hash_mac.c | 10 +++++----- 3 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index c00b6a2e8e3c..13ade5782847 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -219,10 +219,6 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip;
- /* MAC can be src only */ - if (!(opt->flags & IPSET_DIM_TWO_SRC)) - return 0; - ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; @@ -233,7 +229,11 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL;
e.id = ip_to_id(map, ip); - memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest);
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } diff --git a/net/netfilter/ipset/ip_set_hash_ipmac.c b/net/netfilter/ipset/ip_set_hash_ipmac.c index 1ab5ed2f6839..fd87de3ed55b 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmac.c +++ b/net/netfilter/ipset/ip_set_hash_ipmac.c @@ -103,7 +103,11 @@ hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb, (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL;
- memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); + if (ether_addr_equal(e.ether, invalid_ether)) return -EINVAL;
@@ -211,15 +215,15 @@ hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- /* MAC can be src only */ - if (!(opt->flags & IPSET_DIM_TWO_SRC)) - return 0; - if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL;
- memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN); + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); + if (ether_addr_equal(e.ether, invalid_ether)) return -EINVAL;
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index f9d5a2a1e3d0..4fe5f243d0a3 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -81,15 +81,15 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, struct hash_mac4_elem e = { { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- /* MAC can be src only */ - if (!(opt->flags & IPSET_DIM_ONE_SRC)) - return 0; - if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL;
- ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + if (opt->flags & IPSET_DIM_ONE_SRC) + ether_addr_copy(e.ether, eth_hdr(skb)->h_source); + else + ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); + if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
From: Manivannan Sadhasivam manivannan.sadhasivam@linaro.org
[ Upstream commit ed8dce4c6f726b7f3c6bf40859b92a9e32f189c1 ]
Keeping the irq_chip definition static will make it shared with multiple giochips in the system. This practice is considered to be bad and now we will get the below warning from gpiolib core:
"detected irqchip that is shared with multiple gpiochips: please fix the driver."
Hence, move the irq_chip definition from static to `struct pl061` for using a unique irq_chip for each gpiochip.
Signed-off-by: Manivannan Sadhasivam manivannan.sadhasivam@linaro.org Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpio/gpio-pl061.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/drivers/gpio/gpio-pl061.c b/drivers/gpio/gpio-pl061.c index 2afd9de84a0d..dc42571e6fdc 100644 --- a/drivers/gpio/gpio-pl061.c +++ b/drivers/gpio/gpio-pl061.c @@ -54,6 +54,7 @@ struct pl061 {
void __iomem *base; struct gpio_chip gc; + struct irq_chip irq_chip; int parent_irq;
#ifdef CONFIG_PM @@ -281,15 +282,6 @@ static int pl061_irq_set_wake(struct irq_data *d, unsigned int state) return irq_set_irq_wake(pl061->parent_irq, state); }
-static struct irq_chip pl061_irqchip = { - .name = "pl061", - .irq_ack = pl061_irq_ack, - .irq_mask = pl061_irq_mask, - .irq_unmask = pl061_irq_unmask, - .irq_set_type = pl061_irq_type, - .irq_set_wake = pl061_irq_set_wake, -}; - static int pl061_probe(struct amba_device *adev, const struct amba_id *id) { struct device *dev = &adev->dev; @@ -328,6 +320,13 @@ static int pl061_probe(struct amba_device *adev, const struct amba_id *id) /* * irq_chip support */ + pl061->irq_chip.name = dev_name(dev); + pl061->irq_chip.irq_ack = pl061_irq_ack; + pl061->irq_chip.irq_mask = pl061_irq_mask; + pl061->irq_chip.irq_unmask = pl061_irq_unmask; + pl061->irq_chip.irq_set_type = pl061_irq_type; + pl061->irq_chip.irq_set_wake = pl061_irq_set_wake; + writeb(0, pl061->base + GPIOIE); /* disable irqs */ irq = adev->irq[0]; if (irq < 0) { @@ -336,14 +335,14 @@ static int pl061_probe(struct amba_device *adev, const struct amba_id *id) } pl061->parent_irq = irq;
- ret = gpiochip_irqchip_add(&pl061->gc, &pl061_irqchip, + ret = gpiochip_irqchip_add(&pl061->gc, &pl061->irq_chip, 0, handle_bad_irq, IRQ_TYPE_NONE); if (ret) { dev_info(&adev->dev, "could not add irqchip\n"); return ret; } - gpiochip_set_chained_irqchip(&pl061->gc, &pl061_irqchip, + gpiochip_set_chained_irqchip(&pl061->gc, &pl061->irq_chip, irq, pl061_irq_handler);
amba_set_drvdata(adev, pl061);
From: Nicholas Kazlauskas nicholas.kazlauskas@amd.com
[ Upstream commit f41a895026b8cb6f765190de7d2e7bc3ccbbd183 ]
[Why]
The igt@kms_plane@pixel-format-pipe tests can create a sequence where stream_state is NULL during amdgpu_dm_crtc_set_crc_source which results in a null pointer dereference.
[How]
Guard against stream_state being NULL before accessing its fields. This doesn't fix the root cause of the issue so a DRM_ERROR is generated to still fail the tests.
Signed-off-by: Nicholas Kazlauskas nicholas.kazlauskas@amd.com Reviewed-by: David Francis David.Francis@amd.com Acked-by: Leo Li sunpeng.li@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c index 9bfb040352e9..6a6d977ddd7a 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c @@ -60,6 +60,11 @@ int amdgpu_dm_crtc_set_crc_source(struct drm_crtc *crtc, const char *src_name, return -EINVAL; }
+ if (!stream_state) { + DRM_ERROR("No stream state for CRTC%d\n", crtc->index); + return -EINVAL; + } + /* When enabling CRC, we should also disable dithering. */ if (source == AMDGPU_DM_PIPE_CRC_SOURCE_AUTO) { if (dc_stream_configure_crc(stream_state->ctx->dc,
From: Christian König christian.koenig@amd.com
[ Upstream commit 2383a767c0ca06f96534456d8313909017c6c8d0 ]
Vega10 has multiple interrupt rings, so this can be called from multiple calles at the same time resulting in:
[ 71.779334] ================================ [ 71.779406] WARNING: inconsistent lock state [ 71.779478] 4.19.0-rc1+ #44 Tainted: G W [ 71.779565] -------------------------------- [ 71.779637] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. [ 71.779740] kworker/6:1/120 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 71.779832] 00000000ad761971 (&(&kfd->interrupt_lock)->rlock){?...}, at: kgd2kfd_interrupt+0x75/0x100 [amdgpu] [ 71.780058] {IN-HARDIRQ-W} state was registered at: [ 71.780115] _raw_spin_lock+0x2c/0x40 [ 71.780180] kgd2kfd_interrupt+0x75/0x100 [amdgpu] [ 71.780248] amdgpu_irq_callback+0x6c/0x150 [amdgpu] [ 71.780315] amdgpu_ih_process+0x88/0x100 [amdgpu] [ 71.780380] amdgpu_irq_handler+0x20/0x40 [amdgpu] [ 71.780409] __handle_irq_event_percpu+0x49/0x2a0 [ 71.780436] handle_irq_event_percpu+0x30/0x70 [ 71.780461] handle_irq_event+0x37/0x60 [ 71.780484] handle_edge_irq+0x83/0x1b0 [ 71.780506] handle_irq+0x1f/0x30 [ 71.780526] do_IRQ+0x53/0x110 [ 71.780544] ret_from_intr+0x0/0x22 [ 71.780566] cpuidle_enter_state+0xaa/0x330 [ 71.780591] do_idle+0x203/0x280 [ 71.780610] cpu_startup_entry+0x6f/0x80 [ 71.780634] start_secondary+0x1b0/0x200 [ 71.780657] secondary_startup_64+0xa4/0xb0
Fix this by always using irq save spin locks.
Signed-off-by: Christian König christian.koenig@amd.com Acked-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 1427675d0e5a..5aba50f63ac6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -661,6 +661,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) { uint32_t patched_ihre[KFD_MAX_RING_ENTRY_SIZE]; bool is_patched = false; + unsigned long flags;
if (!kfd->init_complete) return; @@ -670,7 +671,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) return; }
- spin_lock(&kfd->interrupt_lock); + spin_lock_irqsave(&kfd->interrupt_lock, flags);
if (kfd->interrupts_active && interrupt_is_wanted(kfd, ih_ring_entry, @@ -679,7 +680,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) is_patched ? patched_ihre : ih_ring_entry)) queue_work(kfd->ih_wq, &kfd->interrupt_work);
- spin_unlock(&kfd->interrupt_lock); + spin_unlock_irqrestore(&kfd->interrupt_lock, flags); }
int kgd2kfd_quiesce_mm(struct mm_struct *mm)
From: Shannon Nelson shannon.nelson@oracle.com
[ Upstream commit 7fa57ca443cffe81ce8416b57966bfb0370678a1 ]
When it's possible that the PF might end up trying to send a packet to one of its own VFs, we have to forbid IPsec offload because the device drops the packets into a black hole. See commit 47b6f50077e6 ("ixgbe: disallow IPsec Tx offload when in SR-IOV mode") for more info.
This really is only necessary when the device is in the default VEB mode. If instead the device is running in VEPA mode, the packets will go through the encryption engine and out the MAC/PHY as normal, and get "hairpinned" as needed by the switch.
So let's not block IPsec offload when in VEPA mode. To get there with the ixgbe device, use the handy 'bridge' command: bridge link set dev eth1 hwmode vepa
Signed-off-by: Shannon Nelson shannon.nelson@oracle.com Tested-by: Andrew Bowers andrewx.bowers@intel.com Signed-off-by: Jeff Kirsher jeffrey.t.kirsher@intel.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index add124e0381d..b27f7a968820 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -4,6 +4,7 @@ #include "ixgbe.h" #include <net/xfrm.h> #include <crypto/aead.h> +#include <linux/if_bridge.h>
/** * ixgbe_ipsec_set_tx_sa - set the Tx SA registers @@ -676,7 +677,8 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs) } else { struct tx_sa tsa;
- if (adapter->num_vfs) + if (adapter->num_vfs && + adapter->bridge_mode != BRIDGE_MODE_VEPA) return -EOPNOTSUPP;
/* find the first unused index */
From: João Paulo Rechi Vita jprvita@gmail.com
[ Upstream commit 78f3ac76d9e5219589718b9e4733bee21627b3f5 ]
In the past, Asus firmwares would change the panel backlight directly through the EC when the display off hotkey (Fn+F7) was pressed, and only notify the OS of such change, with 0x33 when the LCD was ON and 0x34 when the LCD was OFF. These are currently mapped to KEY_DISPLAYTOGGLE and KEY_DISPLAY_OFF, respectively.
Most recently the EC on Asus most machines lost ability to toggle the LCD backlight directly, but unless the OS informs the firmware it is going to handle the display toggle hotkey events, the firmware still tries change the brightness through the EC, to no effect. The end result is a long list (at Endless we counted 11) of Asus laptop models where the display toggle hotkey does not perform any action. Our firmware engineers contacts at Asus were surprised that there were still machines out there with the old behavior.
Calling WMNB(ASUS_WMI_DEVID_BACKLIGHT==0x00050011, 2) on the _WDG device tells the firmware that it should let the OS handle the display toggle event, in which case it will simply notify the OS of a key press with 0x35, as shown by the DSDT excerpts bellow.
Scope (_SB) { (...)
Device (ATKD) { (...)
Name (_WDG, Buffer (0x28) { /* 0000 */ 0xD0, 0x5E, 0x84, 0x97, 0x6D, 0x4E, 0xDE, 0x11, /* 0008 */ 0x8A, 0x39, 0x08, 0x00, 0x20, 0x0C, 0x9A, 0x66, /* 0010 */ 0x4E, 0x42, 0x01, 0x02, 0x35, 0xBB, 0x3C, 0x0B, /* 0018 */ 0xC2, 0xE3, 0xED, 0x45, 0x91, 0xC2, 0x4C, 0x5A, /* 0020 */ 0x6D, 0x19, 0x5D, 0x1C, 0xFF, 0x00, 0x01, 0x08 }) Method (WMNB, 3, Serialized) { CreateDWordField (Arg2, Zero, IIA0) CreateDWordField (Arg2, 0x04, IIA1) Local0 = (Arg1 & 0xFFFFFFFF)
(...)
If ((Local0 == 0x53564544)) { (...)
If ((IIA0 == 0x00050011)) { If ((IIA1 == 0x02)) { ^^PCI0.SBRG.EC0.SPIN (0x72, One) ^^PCI0.SBRG.EC0.BLCT = One }
Return (One) } } (...) } (...) } (...) } (...)
Scope (_SB.PCI0.SBRG.EC0) { (...)
Name (BLCT, Zero)
(...)
Method (_Q10, 0, NotSerialized) // _Qxx: EC Query { If ((BLCT == Zero)) { Local0 = One Local0 = RPIN (0x72) Local0 ^= One SPIN (0x72, Local0) If (ATKP) { Local0 = (0x34 - Local0) ^^^^ATKD.IANE (Local0) } } ElseIf ((BLCT == One)) { If (ATKP) { ^^^^ATKD.IANE (0x35) } } } (...) }
Signed-off-by: João Paulo Rechi Vita jprvita@endlessm.com Signed-off-by: Andy Shevchenko andriy.shevchenko@linux.intel.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/platform/x86/asus-wmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 2d6e272315a8..db3556dc90d1 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -2231,7 +2231,8 @@ static int asus_wmi_add(struct platform_device *pdev) err = asus_wmi_backlight_init(asus); if (err && err != -ENODEV) goto fail_backlight; - } + } else + err = asus_wmi_set_devstate(ASUS_WMI_DEVID_BACKLIGHT, 2, NULL);
status = wmi_install_notify_handler(asus->driver->event_guid, asus_wmi_notify, asus);
From: Miroslav Lichvar mlichvar@redhat.com
[ Upstream commit e1f65b0d70e9e5c80e15105cd96fa00174d7c436 ]
It seems with some NICs supported by the e1000e driver a SYSTIM reading may occasionally be few microseconds before the previous reading and if enabled also pass e1000e_sanitize_systim() without reaching the maximum number of rereads, even if the function is modified to check three consecutive readings (i.e. it doesn't look like a double read error). This causes an underflow in the timecounter and the PHC time jumps hours ahead.
This was observed on 82574, I217 and I219. The fastest way to reproduce it is to run a program that continuously calls the PTP_SYS_OFFSET ioctl on the PHC.
Modify e1000e_phc_gettime() to use timecounter_cyc2time() instead of timecounter_read() in order to allow non-monotonic SYSTIM readings and prevent the PHC from jumping.
Cc: Richard Cochran richardcochran@gmail.com Signed-off-by: Miroslav Lichvar mlichvar@redhat.com Acked-by: Jacob Keller jacob.e.keller@intel.com Tested-by: Aaron Brown aaron.f.brown@intel.com Signed-off-by: Jeff Kirsher jeffrey.t.kirsher@intel.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/net/ethernet/intel/e1000e/ptp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 37c76945ad9b..e1f821edbc21 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -173,10 +173,14 @@ static int e1000e_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter, ptp_clock_info); unsigned long flags; - u64 ns; + u64 cycles, ns;
spin_lock_irqsave(&adapter->systim_lock, flags); - ns = timecounter_read(&adapter->tc); + + /* Use timecounter_cyc2time() to allow non-monotonic SYSTIM readings */ + cycles = adapter->cc.read(&adapter->cc); + ns = timecounter_cyc2time(&adapter->tc, cycles); + spin_unlock_irqrestore(&adapter->systim_lock, flags);
*ts = ns_to_timespec64(ns); @@ -232,9 +236,12 @@ static void e1000e_systim_overflow_work(struct work_struct *work) systim_overflow_work.work); struct e1000_hw *hw = &adapter->hw; struct timespec64 ts; + u64 ns;
- adapter->ptp_clock_info.gettime64(&adapter->ptp_clock_info, &ts); + /* Update the timecounter */ + ns = timecounter_read(&adapter->tc);
+ ts = ns_to_timespec64(ns); e_dbg("SYSTIM overflow check at %lld.%09lu\n", (long long) ts.tv_sec, ts.tv_nsec);
From: Badhri Jagan Sridharan badhri@google.com
[ Upstream commit 23b5f73266e59a598c1e5dd435d87651b5a7626b ]
During HARD_RESET the data link is disconnected. For self powered device, the spec is advising against doing that.
From USB_PD_R3_0
7.1.5 Response to Hard Resets Device operation during and after a Hard Reset is defined as follows: Self-powered devices Should Not disconnect from USB during a Hard Reset (see Section 9.1.2). Bus powered devices will disconnect from USB during a Hard Reset due to the loss of their power source.
Tackle this by letting TCPM know whether the device is self or bus powered.
This overcomes unnecessary port disconnections from hard reset. Also, speeds up the enumeration time when connected to Type-A ports.
Signed-off-by: Badhri Jagan Sridharan badhri@google.com Reviewed-by: Heikki Krogerus heikki.krogerus@linux.intel.com --------- Version history: V3: Rebase on top of usb-next
V2: Based on feedback from heikki.krogerus@linux.intel.com - self_powered added to the struct tcpm_port which is populated from a. "connector" node of the device tree in tcpm_fw_get_caps() b. "self_powered" node of the tcpc_config in tcpm_copy_caps
Based on feedbase from linux@roeck-us.net - Code was refactored - SRC_HARD_RESET_VBUS_OFF sets the link state to false based on self_powered flag
V1 located here: https://lkml.org/lkml/2018/9/13/94 Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/usb/typec/tcpm.c | 12 ++++++++++-- include/linux/usb/tcpm.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/usb/typec/tcpm.c b/drivers/usb/typec/tcpm.c index c74cc9c309b1..3457c1fdebd1 100644 --- a/drivers/usb/typec/tcpm.c +++ b/drivers/usb/typec/tcpm.c @@ -317,6 +317,9 @@ struct tcpm_port { /* Deadline in jiffies to exit src_try_wait state */ unsigned long max_wait;
+ /* port belongs to a self powered device */ + bool self_powered; + #ifdef CONFIG_DEBUG_FS struct dentry *dentry; struct mutex logbuffer_lock; /* log buffer access lock */ @@ -3257,7 +3260,8 @@ static void run_state_machine(struct tcpm_port *port) case SRC_HARD_RESET_VBUS_OFF: tcpm_set_vconn(port, true); tcpm_set_vbus(port, false); - tcpm_set_roles(port, false, TYPEC_SOURCE, TYPEC_HOST); + tcpm_set_roles(port, port->self_powered, TYPEC_SOURCE, + TYPEC_HOST); tcpm_set_state(port, SRC_HARD_RESET_VBUS_ON, PD_T_SRC_RECOVER); break; case SRC_HARD_RESET_VBUS_ON: @@ -3270,7 +3274,8 @@ static void run_state_machine(struct tcpm_port *port) memset(&port->pps_data, 0, sizeof(port->pps_data)); tcpm_set_vconn(port, false); tcpm_set_charge(port, false); - tcpm_set_roles(port, false, TYPEC_SINK, TYPEC_DEVICE); + tcpm_set_roles(port, port->self_powered, TYPEC_SINK, + TYPEC_DEVICE); /* * VBUS may or may not toggle, depending on the adapter. * If it doesn't toggle, transition to SNK_HARD_RESET_SINK_ON @@ -4415,6 +4420,8 @@ static int tcpm_fw_get_caps(struct tcpm_port *port, return -EINVAL; port->operating_snk_mw = mw / 1000;
+ port->self_powered = fwnode_property_read_bool(fwnode, "self-powered"); + return 0; }
@@ -4723,6 +4730,7 @@ static int tcpm_copy_caps(struct tcpm_port *port, port->typec_caps.prefer_role = tcfg->default_role; port->typec_caps.type = tcfg->type; port->typec_caps.data = tcfg->data; + port->self_powered = port->tcpc->config->self_powered;
return 0; } diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index 7e7fbfb84e8e..50c74a77db55 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -89,6 +89,7 @@ struct tcpc_config { enum typec_port_data data; enum typec_role default_role; bool try_role_hw; /* try.{src,snk} implemented in hardware */ + bool self_powered; /* port belongs to a self powered device */
const struct typec_altmode_desc *alt_modes; };
From: Quentin Monnet quentin.monnet@netronome.com
[ Upstream commit f96afa767baffba7645f5e10998f5178948bb9aa ]
libbpf is now able to load successfully test_l4lb_noinline.o and samples/bpf/tracex3_kern.o.
For the test_l4lb_noinline, uncomment related tests from test_libbpf.c and remove the associated "TODO".
For tracex3_kern.o, instead of loading a program from samples/bpf/ that might not have been compiled at this stage, try loading a program from BPF selftests. Since this test case is about loading a program compiled without the "-target bpf" flag, change the Makefile to compile one program accordingly (instead of passing the flag for compiling all programs).
Regarding test_xdp_noinline.o: in its current shape the program fails to load because it provides no version section, but the loader needs one. The test was added to make sure that libbpf could load XDP programs even if they do not provide a version number in a dedicated section. But libbpf is already capable of doing that: in our case loading fails because the loader does not know that this is an XDP program (it does not need to, since it does not attach the program). So trying to load test_xdp_noinline.o does not bring much here: just delete this subtest.
For the record, the error message obtained with tracex3_kern.o was fixed by commit e3d91b0ca523 ("tools/libbpf: handle issues with bpf ELF objects containing .eh_frames")
I have not been abled to reproduce the "libbpf: incorrect bpf_call opcode" error for test_l4lb_noinline.o, even with the version of libbpf present at the time when test_libbpf.sh and test_libbpf_open.c were created.
RFC -> v1: - Compile test_xdp without the "-target bpf" flag, and try to load it instead of ../../samples/bpf/tracex3_kern.o. - Delete test_xdp_noinline.o subtest.
Cc: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Quentin Monnet quentin.monnet@netronome.com Acked-by: Jakub Kicinski jakub.kicinski@netronome.com Acked-by: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Signed-off-by: Sasha Levin sashal@kernel.org --- tools/testing/selftests/bpf/Makefile | 10 ++++++++++ tools/testing/selftests/bpf/test_libbpf.sh | 14 ++++---------- 2 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index fff7fb1285fc..f3f874ba186b 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -124,6 +124,16 @@ endif endif endif
+# Have one program compiled without "-target bpf" to test whether libbpf loads +# it successfully +$(OUTPUT)/test_xdp.o: test_xdp.c + $(CLANG) $(CLANG_FLAGS) \ + -O2 -emit-llvm -c $< -o - | \ + $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ +ifeq ($(DWARF2BTF),y) + $(BTF_PAHOLE) -J $@ +endif + $(OUTPUT)/%.o: %.c $(CLANG) $(CLANG_FLAGS) \ -O2 -target bpf -emit-llvm -c $< -o - | \ diff --git a/tools/testing/selftests/bpf/test_libbpf.sh b/tools/testing/selftests/bpf/test_libbpf.sh index d97dc914cd49..8b1bc96d8e0c 100755 --- a/tools/testing/selftests/bpf/test_libbpf.sh +++ b/tools/testing/selftests/bpf/test_libbpf.sh @@ -33,17 +33,11 @@ trap exit_handler 0 2 3 6 9
libbpf_open_file test_l4lb.o
-# TODO: fix libbpf to load noinline functions -# [warning] libbpf: incorrect bpf_call opcode -#libbpf_open_file test_l4lb_noinline.o +# Load a program with BPF-to-BPF calls +libbpf_open_file test_l4lb_noinline.o
-# TODO: fix test_xdp_meta.c to load with libbpf -# [warning] libbpf: test_xdp_meta.o doesn't provide kernel version -#libbpf_open_file test_xdp_meta.o - -# TODO: fix libbpf to handle .eh_frame -# [warning] libbpf: relocation failed: no section(10) -#libbpf_open_file ../../../../samples/bpf/tracex3_kern.o +# Load a program compiled without the "-target bpf" flag +libbpf_open_file test_xdp.o
# Success exit 0
From: Frank Rowand frank.rowand@sony.com
[ Upstream commit 7c528e457d53c75107d5aa56892316d265c778de ]
The refcount of a newly added overlay node decrements to one (instead of zero) when the overlay changeset is destroyed. This change will cause the final decrement be to zero.
After applying this patch, new validation warnings will be reported from the devicetree unittest during boot due to a pre-existing devicetree bug. The warnings will be similar to:
OF: ERROR: memory leak before free overlay changeset, /testcase-data/overlay-node/test-bus/test-unittest4
This pre-existing devicetree bug will also trigger a WARN_ONCE() from refcount_sub_and_test_checked() when an overlay changeset is destroyed without having first been applied. This scenario occurs when an error in the overlay is detected during the overlay changeset creation:
WARNING: CPU: 0 PID: 1 at lib/refcount.c:187 refcount_sub_and_test_checked+0xa8/0xbc refcount_t: underflow; use-after-free.
(unwind_backtrace) from (show_stack+0x10/0x14) (show_stack) from (dump_stack+0x6c/0x8c) (dump_stack) from (__warn+0xdc/0x104) (__warn) from (warn_slowpath_fmt+0x44/0x6c) (warn_slowpath_fmt) from (refcount_sub_and_test_checked+0xa8/0xbc) (refcount_sub_and_test_checked) from (kobject_put+0x24/0x208) (kobject_put) from (of_changeset_destroy+0x2c/0xb4) (of_changeset_destroy) from (free_overlay_changeset+0x1c/0x9c) (free_overlay_changeset) from (of_overlay_remove+0x284/0x2cc) (of_overlay_remove) from (of_unittest_apply_revert_overlay_check.constprop.4+0xf8/0x1e8) (of_unittest_apply_revert_overlay_check.constprop.4) from (of_unittest_overlay+0x960/0xed8) (of_unittest_overlay) from (of_unittest+0x1cc4/0x2138) (of_unittest) from (do_one_initcall+0x4c/0x28c) (do_one_initcall) from (kernel_init_freeable+0x29c/0x378) (kernel_init_freeable) from (kernel_init+0x8/0x110) (kernel_init) from (ret_from_fork+0x14/0x2c)
Tested-by: Alan Tull atull@kernel.org Signed-off-by: Frank Rowand frank.rowand@sony.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/of/overlay.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index eda57ef12fd0..baa9cee6fa2c 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -378,7 +378,9 @@ static int add_changeset_node(struct overlay_changeset *ovcs, if (ret) return ret;
- return build_changeset_next_level(ovcs, tchild, node); + ret = build_changeset_next_level(ovcs, tchild, node); + of_node_put(tchild); + return ret; }
if (node->phandle && tchild->phandle)
From: Anders Roxell anders.roxell@linaro.org
[ Upstream commit 347a28b586802d09604a149c1a1f6de5dccbe6fa ]
This happened while running in qemu-system-aarch64, the AMBA PL011 UART driver when enabling CONFIG_DEBUG_TEST_DRIVER_REMOVE. arch_initcall(pl011_init) came before subsys_initcall(default_bdi_init), devtmpfs' handle_remove() crashes because the reference count is a NULL pointer only because wb->bdi hasn't been initialized yet.
Rework so that wb_put have an extra check if wb->bdi before decrement wb->refcnt and also add a WARN_ON_ONCE to get a warning if it happens again in other drivers.
Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Co-developed-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Anders Roxell anders.roxell@linaro.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- include/linux/backing-dev-defs.h | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 9a6bc0951cfa..c31157135598 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -258,6 +258,14 @@ static inline void wb_get(struct bdi_writeback *wb) */ static inline void wb_put(struct bdi_writeback *wb) { + if (WARN_ON_ONCE(!wb->bdi)) { + /* + * A driver bug might cause a file to be removed before bdi was + * initialized. + */ + return; + } + if (wb != &wb->bdi->wb) percpu_ref_put(&wb->refcnt); }
From: Anders Roxell anders.roxell@linaro.org
[ Upstream commit 646097940ad35aa2c1f2012af932d55976a9f255 ]
When the test 'CONFIG_DEBUG_TEST_DRIVER_REMOVE=y' is enabled, arch_initcall(pl011_init) came before subsys_initcall(default_bdi_init). devtmpfs gets killed because we try to remove a file and decrement the wb reference count before the noop_backing_device_info gets initialized.
[ 0.332075] Serial: AMBA PL011 UART driver [ 0.485276] 9000000.pl011: ttyAMA0 at MMIO 0x9000000 (irq = 39, base_baud = 0) is a PL011 rev1 [ 0.502382] console [ttyAMA0] enabled [ 0.515710] Unable to handle kernel paging request at virtual address 0000800074c12000 [ 0.516053] Mem abort info: [ 0.516222] ESR = 0x96000004 [ 0.516417] Exception class = DABT (current EL), IL = 32 bits [ 0.516641] SET = 0, FnV = 0 [ 0.516826] EA = 0, S1PTW = 0 [ 0.516984] Data abort info: [ 0.517149] ISV = 0, ISS = 0x00000004 [ 0.517339] CM = 0, WnR = 0 [ 0.517553] [0000800074c12000] user address but active_mm is swapper [ 0.517928] Internal error: Oops: 96000004 [#1] PREEMPT SMP [ 0.518305] Modules linked in: [ 0.518839] CPU: 0 PID: 13 Comm: kdevtmpfs Not tainted 4.19.0-rc5-next-20180928-00002-g2ba39ab0cd01-dirty #82 [ 0.519307] Hardware name: linux,dummy-virt (DT) [ 0.519681] pstate: 80000005 (Nzcv daif -PAN -UAO) [ 0.519959] pc : __destroy_inode+0x94/0x2a8 [ 0.520212] lr : __destroy_inode+0x78/0x2a8 [ 0.520401] sp : ffff0000098c3b20 [ 0.520590] x29: ffff0000098c3b20 x28: 00000000087a3714 [ 0.520904] x27: 0000000000002000 x26: 0000000000002000 [ 0.521179] x25: ffff000009583000 x24: 0000000000000000 [ 0.521467] x23: ffff80007bb52000 x22: ffff80007bbaa7c0 [ 0.521737] x21: ffff0000093f9338 x20: 0000000000000000 [ 0.522033] x19: ffff80007bbb05d8 x18: 0000000000000400 [ 0.522376] x17: 0000000000000000 x16: 0000000000000000 [ 0.522727] x15: 0000000000000400 x14: 0000000000000400 [ 0.523068] x13: 0000000000000001 x12: 0000000000000001 [ 0.523421] x11: 0000000000000000 x10: 0000000000000970 [ 0.523749] x9 : ffff0000098c3a60 x8 : ffff80007bbab190 [ 0.524017] x7 : ffff80007bbaa880 x6 : 0000000000000c88 [ 0.524305] x5 : ffff0000093d96c8 x4 : 61c8864680b583eb [ 0.524567] x3 : ffff0000093d6180 x2 : ffffffffffffffff [ 0.524872] x1 : 0000800074c12000 x0 : 0000800074c12000 [ 0.525207] Process kdevtmpfs (pid: 13, stack limit = 0x(____ptrval____)) [ 0.525529] Call trace: [ 0.525806] __destroy_inode+0x94/0x2a8 [ 0.526108] destroy_inode+0x34/0x88 [ 0.526370] evict+0x144/0x1c8 [ 0.526636] iput+0x184/0x230 [ 0.526871] dentry_unlink_inode+0x118/0x130 [ 0.527152] d_delete+0xd8/0xe0 [ 0.527420] vfs_unlink+0x240/0x270 [ 0.527665] handle_remove+0x1d8/0x330 [ 0.527875] devtmpfsd+0x138/0x1c8 [ 0.528085] kthread+0x14c/0x158 [ 0.528291] ret_from_fork+0x10/0x18 [ 0.528720] Code: 92800002 aa1403e0 d538d081 8b010000 (c85f7c04) [ 0.529367] ---[ end trace 5a3dee47727f877c ]---
Rework to set suppress_bind_attrs flag to avoid removing the device when CONFIG_DEBUG_TEST_DRIVER_REMOVE=y. This applies for pic32_uart and xilinx_uartps as well.
Co-developed-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Anders Roxell anders.roxell@linaro.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/tty/serial/amba-pl011.c | 2 ++ drivers/tty/serial/pic32_uart.c | 1 + drivers/tty/serial/xilinx_uartps.c | 1 + 3 files changed, 4 insertions(+)
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index ebd33c0232e6..89ade213a1a9 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2780,6 +2780,7 @@ static struct platform_driver arm_sbsa_uart_platform_driver = { .name = "sbsa-uart", .of_match_table = of_match_ptr(sbsa_uart_of_match), .acpi_match_table = ACPI_PTR(sbsa_uart_acpi_match), + .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_AMBA_PL011), }, };
@@ -2808,6 +2809,7 @@ static struct amba_driver pl011_driver = { .drv = { .name = "uart-pl011", .pm = &pl011_dev_pm_ops, + .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_AMBA_PL011), }, .id_table = pl011_ids, .probe = pl011_probe, diff --git a/drivers/tty/serial/pic32_uart.c b/drivers/tty/serial/pic32_uart.c index fd80d999308d..0bdf1687983f 100644 --- a/drivers/tty/serial/pic32_uart.c +++ b/drivers/tty/serial/pic32_uart.c @@ -919,6 +919,7 @@ static struct platform_driver pic32_uart_platform_driver = { .driver = { .name = PIC32_DEV_NAME, .of_match_table = of_match_ptr(pic32_serial_dt_ids), + .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_PIC32), }, };
diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index a48f19b1b88f..e5e9924ed7fb 100644 --- a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -1608,6 +1608,7 @@ static struct platform_driver cdns_uart_platform_driver = { .name = CDNS_UART_NAME, .of_match_table = cdns_uart_of_match, .pm = &cdns_uart_dev_pm_ops, + .suppress_bind_attrs = IS_BUILTIN(CONFIG_SERIAL_XILINX_PS_UART), }, };
From: Andrey Ignatov rdna@fb.com
[ Upstream commit 46f53a65d2de3e1591636c22b626b09d8684fd71 ]
Currently BPF verifier allows narrow loads for a context field only with offset zero. E.g. if there is a __u32 field then only the following loads are permitted: * off=0, size=1 (narrow); * off=0, size=2 (narrow); * off=0, size=4 (full).
On the other hand LLVM can generate a load with offset different than zero that make sense from program logic point of view, but verifier doesn't accept it.
E.g. tools/testing/selftests/bpf/sendmsg4_prog.c has code:
#define DST_IP4 0xC0A801FEU /* 192.168.1.254 */ ... if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) &&
where ctx is struct bpf_sock_addr.
Some versions of LLVM can produce the following byte code for it:
8: 71 12 07 00 00 00 00 00 r2 = *(u8 *)(r1 + 7) 9: 67 02 00 00 18 00 00 00 r2 <<= 24 10: 18 03 00 00 00 00 00 fe 00 00 00 00 00 00 00 00 r3 = 4261412864 ll 12: 5d 32 07 00 00 00 00 00 if r2 != r3 goto +7 <LBB0_6>
where `*(u8 *)(r1 + 7)` means narrow load for ctx->user_ip4 with size=1 and offset=3 (7 - sizeof(ctx->user_family) = 3). This load is currently rejected by verifier.
Verifier code that rejects such loads is in bpf_ctx_narrow_access_ok() what means any is_valid_access implementation, that uses the function, works this way, e.g. bpf_skb_is_valid_access() for __sk_buff or sock_addr_is_valid_access() for bpf_sock_addr.
The patch makes such loads supported. Offset can be in [0; size_default) but has to be multiple of load size. E.g. for __u32 field the following loads are supported now: * off=0, size=1 (narrow); * off=1, size=1 (narrow); * off=2, size=1 (narrow); * off=3, size=1 (narrow); * off=0, size=2 (narrow); * off=2, size=2 (narrow); * off=0, size=4 (full).
Reported-by: Yonghong Song yhs@fb.com Signed-off-by: Andrey Ignatov rdna@fb.com Signed-off-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- include/linux/filter.h | 16 +--------------- kernel/bpf/verifier.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 20 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h index 6791a0ac0139..ec90d5255cf7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -665,24 +665,10 @@ static inline u32 bpf_ctx_off_adjust_machine(u32 size) return size; }
-static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access, - u32 size_default) -{ - size_default = bpf_ctx_off_adjust_machine(size_default); - size_access = bpf_ctx_off_adjust_machine(size_access); - -#ifdef __LITTLE_ENDIAN - return (off & (size_default - 1)) == 0; -#else - return (off & (size_default - 1)) + size_access == size_default; -#endif -} - static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { - return bpf_ctx_narrow_align_ok(off, size, size_default) && - size <= size_default && (size & (size - 1)) == 0; + return size <= size_default && (size & (size - 1)) == 0; }
#define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2954e4b3abd5..89cea3ed535d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5341,10 +5341,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) int i, cnt, size, ctx_field_size, delta = 0; const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16], *insn; + u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; - u32 target_size;
if (ops->gen_prologue) { cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, @@ -5421,9 +5421,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * we will apply proper mask to the result. */ is_narrower_load = size < ctx_field_size; + size_default = bpf_ctx_off_adjust_machine(ctx_field_size); + off = insn->off; if (is_narrower_load) { - u32 size_default = bpf_ctx_off_adjust_machine(ctx_field_size); - u32 off = insn->off; u8 size_code;
if (type == BPF_WRITE) { @@ -5451,12 +5451,23 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) }
if (is_narrower_load && size < target_size) { - if (ctx_field_size <= 4) + u8 shift = (off & (size_default - 1)) * 8; + + if (ctx_field_size <= 4) { + if (shift) + insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); - else + } else { + if (shift) + insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, + insn->dst_reg, + shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, (1 << size * 8) - 1); + } }
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
From: Takashi Sakamoto o-takashi@sakamocchi.jp
[ Upstream commit fba43f454cdf9caa3185219d116bd2a6e6354552 ]
This commit adds support for APOGEE duet FireWire, launched 2007, already discontinued. This model uses Oxford Semiconductor FW971 as its communication engine. Below is information on Configuration ROM of this unit. The unit supports some AV/C commands defined by Audio subunit specification and vendor dependent commands.
$ ./hinawa-config-rom-printer /dev/fw1 { 'bus-info': { 'adj': False, 'bmc': False, 'chip_ID': 42949742248, 'cmc': False, 'cyc_clk_acc': 255, 'generation': 0, 'imc': False, 'isc': True, 'link_spd': 3, 'max_ROM': 0, 'max_rec': 64, 'name': '1394', 'node_vendor_ID': 987, 'pmc': False}, 'root-directory': [ ['VENDOR', 987], ['DESCRIPTOR', 'Apogee Electronics'], ['MODEL', 122333], ['DESCRIPTOR', 'Duet'], [ 'NODE_CAPABILITIES', { 'addressing': {'64': True, 'fix': True, 'prv': False}, 'misc': {'int': False, 'ms': False, 'spt': True}, 'state': { 'atn': False, 'ded': False, 'drq': True, 'elo': False, 'init': False, 'lst': True, 'off': False}, 'testing': {'bas': False, 'ext': False}}], [ 'UNIT', [ ['SPECIFIER_ID', 41005], ['VERSION', 65537], ['MODEL', 122333], ['DESCRIPTOR', 'Duet']]]]}
Signed-off-by: Takashi Sakamoto o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai tiwai@suse.de Signed-off-by: Sasha Levin sashal@kernel.org --- sound/firewire/Kconfig | 1 + sound/firewire/oxfw/oxfw.c | 8 ++++++++ 2 files changed, 9 insertions(+)
diff --git a/sound/firewire/Kconfig b/sound/firewire/Kconfig index 529d9f405fa9..0cb65d0864cc 100644 --- a/sound/firewire/Kconfig +++ b/sound/firewire/Kconfig @@ -41,6 +41,7 @@ config SND_OXFW * Mackie(Loud) U.420/U.420d * TASCAM FireOne * Stanton Controllers & Systems 1 Deck/Mixer + * APOGEE duet FireWire
To compile this driver as a module, choose M here: the module will be called snd-oxfw. diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c index 2ea8be6c8584..5f82a375725a 100644 --- a/sound/firewire/oxfw/oxfw.c +++ b/sound/firewire/oxfw/oxfw.c @@ -20,6 +20,7 @@ #define VENDOR_LACIE 0x00d04b #define VENDOR_TASCAM 0x00022e #define OUI_STANTON 0x001260 +#define OUI_APOGEE 0x0003db
#define MODEL_SATELLITE 0x00200f
@@ -436,6 +437,13 @@ static const struct ieee1394_device_id oxfw_id_table[] = { .vendor_id = OUI_STANTON, .model_id = 0x002000, }, + // APOGEE, duet FireWire + { + .match_flags = IEEE1394_MATCH_VENDOR_ID | + IEEE1394_MATCH_MODEL_ID, + .vendor_id = OUI_APOGEE, + .model_id = 0x01dddd, + }, { } }; MODULE_DEVICE_TABLE(ieee1394, oxfw_id_table);
From: Borislav Petkov bp@suse.de
[ Upstream commit 68b5e4326e4b8ac9080835005d8254fed0fb3c56 ]
Add the proper includes and make smca_get_name() static.
Fix an actual bug too which the warning triggered:
arch/x86/kernel/cpu/mcheck/therm_throt.c:395:39: error: conflicting \ types for ‘smp_thermal_interrupt’ asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) ^~~~~~~~~~~~~~~~~~~~~ In file included from arch/x86/kernel/cpu/mcheck/therm_throt.c:29: ./arch/x86/include/asm/traps.h:107:17: note: previous declaration of \ ‘smp_thermal_interrupt’ was here asmlinkage void smp_thermal_interrupt(void);
Signed-off-by: Borislav Petkov bp@suse.de Cc: Yi Wang wang.yi59@zte.com.cn Cc: Michael Matz matz@suse.de Cc: x86@kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1811081633160.1549@nanos.tec.linut... Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/include/asm/traps.h | 6 +++--- arch/x86/kernel/cpu/mcheck/mce_amd.c | 5 +++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 ++- arch/x86/kernel/cpu/mcheck/threshold.c | 3 ++- 4 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 3de69330e6c5..afbc87206886 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -104,9 +104,9 @@ extern int panic_on_unrecovered_nmi;
void math_emulate(struct math_emu_info *); #ifndef CONFIG_X86_32 -asmlinkage void smp_thermal_interrupt(void); -asmlinkage void smp_threshold_interrupt(void); -asmlinkage void smp_deferred_error_interrupt(void); +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs); +asmlinkage void smp_threshold_interrupt(struct pt_regs *regs); +asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs); #endif
extern void ist_enter(struct pt_regs *regs); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e12454e21b8a..9f915a8791cc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -23,6 +23,7 @@ #include <linux/string.h>
#include <asm/amd_nb.h> +#include <asm/traps.h> #include <asm/apic.h> #include <asm/mce.h> #include <asm/msr.h> @@ -99,7 +100,7 @@ static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } };
-const char *smca_get_name(enum smca_bank_types t) +static const char *smca_get_name(enum smca_bank_types t) { if (t >= N_SMCA_BANK_TYPES) return NULL; @@ -824,7 +825,7 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) mce_log(&m); }
-asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void) +asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs) { entering_irq(); trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 2da67b70ba98..ee229ceee745 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -25,6 +25,7 @@ #include <linux/cpu.h>
#include <asm/processor.h> +#include <asm/traps.h> #include <asm/apic.h> #include <asm/mce.h> #include <asm/msr.h> @@ -390,7 +391,7 @@ static void unexpected_thermal_interrupt(void)
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
-asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *r) +asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 2b584b319eff..c21e0a1efd0f 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -6,6 +6,7 @@ #include <linux/kernel.h>
#include <asm/irq_vectors.h> +#include <asm/traps.h> #include <asm/apic.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> @@ -18,7 +19,7 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
-asmlinkage __visible void __irq_entry smp_threshold_interrupt(void) +asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
From: "Maciej W. Rozycki" macro@linux-mips.org
[ Upstream commit e4849aff1e169b86c561738daf8ff020e9de1011 ]
The Broadcom SiByte BCM1250, BCM1125, and BCM1125H SOCs have an onchip DRAM controller that supports memory amounts of up to 16GiB, and due to how the address decoder has been wired in the SOC any memory beyond 1GiB is actually mapped starting from 4GiB physical up, that is beyond the 32-bit addressable limit[1]. Consequently if the maximum amount of memory has been installed, then it will span up to 19GiB.
Many of the evaluation boards we support that are based on one of these SOCs have their memory soldered and the amount present fits in the 32-bit address range. The BCM91250A SWARM board however has actual DIMM slots and accepts, depending on the peripherals revision of the SOC, up to 4GiB or 8GiB of memory in commercially available JEDEC modules[2]. I believe this is also the case with the BCM91250C2 LittleSur board. This means that up to either 3GiB or 7GiB of memory requires 64-bit addressing to access.
I believe the BCM91480B BigSur board, which has the BCM1480 SOC instead, accepts at least as much memory, although I have no documentation or actual hardware available to verify that.
Both systems have PCI slots installed for use by any PCI option boards, including ones that only support 32-bit addressing (additionally the 32-bit PCI host bridge of the BCM1250, BCM1125, and BCM1125H SOCs limits addressing to 32-bits), and there is no IOMMU available. Therefore for PCI DMA to work in the presence of memory beyond enable swiotlb for the affected systems.
All the other SOC onchip DMA devices use 40-bit addressing and therefore can address the whole memory, so only enable swiotlb if PCI support and support for DMA beyond 4GiB have been both enabled in the configuration of the kernel.
This shows up as follows:
Broadcom SiByte BCM1250 B2 @ 800 MHz (SB1 rev 2) Board type: SiByte BCM91250A (SWARM) Determined physical RAM map: memory: 000000000fe7fe00 @ 0000000000000000 (usable) memory: 000000001ffffe00 @ 0000000080000000 (usable) memory: 000000000ffffe00 @ 00000000c0000000 (usable) memory: 0000000087fffe00 @ 0000000100000000 (usable) software IO TLB: mapped [mem 0xcbffc000-0xcfffc000] (64MB)
in the bootstrap log and removes failures like these:
defxx 0000:02:00.0: dma_direct_map_page: overflow 0x0000000185bc6080+4608 of device mask ffffffff bus mask 0 fddi0: Receive buffer allocation failed fddi0: Adapter open failed! IP-Config: Failed to open fddi0 defxx 0000:09:08.0: dma_direct_map_page: overflow 0x0000000185bc6080+4608 of device mask ffffffff bus mask 0 fddi1: Receive buffer allocation failed fddi1: Adapter open failed! IP-Config: Failed to open fddi1
when memory beyond 4GiB is handed out to devices that can only do 32-bit addressing.
This updates commit cce335ae47e2 ("[MIPS] 64-bit Sibyte kernels need DMA32.").
References:
[1] "BCM1250/BCM1125/BCM1125H User Manual", Revision 1250_1125-UM100-R, Broadcom Corporation, 21 Oct 2002, Section 3: "System Overview", "Memory Map", pp. 34-38
[2] "BCM91250A User Manual", Revision 91250A-UM100-R, Broadcom Corporation, 18 May 2004, Section 3: "Physical Description", "Supported DRAM", p. 23
Signed-off-by: Maciej W. Rozycki macro@linux-mips.org [paul.burton@mips.com: Remove GPL text from dma.c; SPDX tag covers it] Signed-off-by: Paul Burton paul.burton@mips.com Reviewed-by: Christoph Hellwig hch@lst.de Patchwork: https://patchwork.linux-mips.org/patch/21108/ References: cce335ae47e2 ("[MIPS] 64-bit Sibyte kernels need DMA32.") Cc: Ralf Baechle ralf@linux-mips.org Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/mips/Kconfig | 3 +++ arch/mips/sibyte/common/Makefile | 1 + arch/mips/sibyte/common/dma.c | 14 ++++++++++++++ 3 files changed, 18 insertions(+) create mode 100644 arch/mips/sibyte/common/dma.c
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 35511999156a..b245e34bd97f 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -794,6 +794,7 @@ config SIBYTE_SWARM select SYS_SUPPORTS_HIGHMEM select SYS_SUPPORTS_LITTLE_ENDIAN select ZONE_DMA32 if 64BIT + select SWIOTLB if ARCH_DMA_ADDR_T_64BIT && PCI
config SIBYTE_LITTLESUR bool "Sibyte BCM91250C2-LittleSur" @@ -814,6 +815,7 @@ config SIBYTE_SENTOSA select SYS_HAS_CPU_SB1 select SYS_SUPPORTS_BIG_ENDIAN select SYS_SUPPORTS_LITTLE_ENDIAN + select SWIOTLB if ARCH_DMA_ADDR_T_64BIT && PCI
config SIBYTE_BIGSUR bool "Sibyte BCM91480B-BigSur" @@ -826,6 +828,7 @@ config SIBYTE_BIGSUR select SYS_SUPPORTS_HIGHMEM select SYS_SUPPORTS_LITTLE_ENDIAN select ZONE_DMA32 if 64BIT + select SWIOTLB if ARCH_DMA_ADDR_T_64BIT && PCI
config SNI_RM bool "SNI RM200/300/400" diff --git a/arch/mips/sibyte/common/Makefile b/arch/mips/sibyte/common/Makefile index b3d6bf23a662..3ef3fb658136 100644 --- a/arch/mips/sibyte/common/Makefile +++ b/arch/mips/sibyte/common/Makefile @@ -1,4 +1,5 @@ obj-y := cfe.o +obj-$(CONFIG_SWIOTLB) += dma.o obj-$(CONFIG_SIBYTE_BUS_WATCHER) += bus_watcher.o obj-$(CONFIG_SIBYTE_CFE_CONSOLE) += cfe_console.o obj-$(CONFIG_SIBYTE_TBPROF) += sb_tbprof.o diff --git a/arch/mips/sibyte/common/dma.c b/arch/mips/sibyte/common/dma.c new file mode 100644 index 000000000000..eb47a94f3583 --- /dev/null +++ b/arch/mips/sibyte/common/dma.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * DMA support for Broadcom SiByte platforms. + * + * Copyright (c) 2018 Maciej W. Rozycki + */ + +#include <linux/swiotlb.h> +#include <asm/bootinfo.h> + +void __init plat_swiotlb_setup(void) +{ + swiotlb_init(1); +}
From: Vitaly Chikunov vt@altlinux.org
[ Upstream commit 3da2c1dfdb802b184eea0653d1e589515b52d74b ]
ecc_point_mult is supposed to be used with a regularized scalar, otherwise, it's possible to deduce the position of the top bit of the scalar with timing attack. This is important when the scalar is a private key.
ecc_point_mult is already using a regular algorithm (i.e. having an operation flow independent of the input scalar) but regularization step is not implemented.
Arrange scalar to always have fixed top bit by adding a multiple of the curve order (n).
References: The constant time regularization step is based on micro-ecc by Kenneth MacKay and also referenced in the literature (Bernstein, D. J., & Lange, T. (2017). Montgomery curves and the Montgomery ladder. (Cryptology ePrint Archive; Vol. 2017/293). s.l.: IACR. Chapter 4.6.2.)
Signed-off-by: Vitaly Chikunov vt@altlinux.org Cc: kernel-hardening@lists.openwall.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Sasha Levin sashal@kernel.org --- crypto/ecc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/crypto/ecc.c b/crypto/ecc.c index 8facafd67802..adcce310f646 100644 --- a/crypto/ecc.c +++ b/crypto/ecc.c @@ -842,15 +842,23 @@ static void xycz_add_c(u64 *x1, u64 *y1, u64 *x2, u64 *y2, u64 *curve_prime,
static void ecc_point_mult(struct ecc_point *result, const struct ecc_point *point, const u64 *scalar, - u64 *initial_z, u64 *curve_prime, + u64 *initial_z, const struct ecc_curve *curve, unsigned int ndigits) { /* R0 and R1 */ u64 rx[2][ECC_MAX_DIGITS]; u64 ry[2][ECC_MAX_DIGITS]; u64 z[ECC_MAX_DIGITS]; + u64 sk[2][ECC_MAX_DIGITS]; + u64 *curve_prime = curve->p; int i, nb; - int num_bits = vli_num_bits(scalar, ndigits); + int num_bits; + int carry; + + carry = vli_add(sk[0], scalar, curve->n, ndigits); + vli_add(sk[1], sk[0], curve->n, ndigits); + scalar = sk[!carry]; + num_bits = sizeof(u64) * ndigits * 8 + 1;
vli_set(rx[1], point->x, ndigits); vli_set(ry[1], point->y, ndigits); @@ -1004,7 +1012,7 @@ int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits, goto out; }
- ecc_point_mult(pk, &curve->g, priv, NULL, curve->p, ndigits); + ecc_point_mult(pk, &curve->g, priv, NULL, curve, ndigits); if (ecc_point_is_zero(pk)) { ret = -EAGAIN; goto err_free_point; @@ -1090,7 +1098,7 @@ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits, goto err_alloc_product; }
- ecc_point_mult(product, pk, priv, rand_z, curve->p, ndigits); + ecc_point_mult(product, pk, priv, rand_z, curve, ndigits);
ecc_swap_digits(product->x, secret, ndigits);
From: Anders Roxell anders.roxell@linaro.org
[ Upstream commit 81e9fa8bab381f8b6eb04df7cdf0f71994099bd4 ]
The armv8_pmuv3 driver doesn't have a remove function, and when the test 'CONFIG_DEBUG_TEST_DRIVER_REMOVE=y' is enabled, the following Call trace can be seen.
[ 1.424287] Failed to register pmu: armv8_pmuv3, reason -17 [ 1.424870] WARNING: CPU: 0 PID: 1 at ../kernel/events/core.c:11771 perf_event_sysfs_init+0x98/0xdc [ 1.425220] Modules linked in: [ 1.425531] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.19.0-rc7-next-20181012-00003-ge7a97b1ad77b-dirty #35 [ 1.425951] Hardware name: linux,dummy-virt (DT) [ 1.426212] pstate: 80000005 (Nzcv daif -PAN -UAO) [ 1.426458] pc : perf_event_sysfs_init+0x98/0xdc [ 1.426720] lr : perf_event_sysfs_init+0x98/0xdc [ 1.426908] sp : ffff00000804bd50 [ 1.427077] x29: ffff00000804bd50 x28: ffff00000934e078 [ 1.427429] x27: ffff000009546000 x26: 0000000000000007 [ 1.427757] x25: ffff000009280710 x24: 00000000ffffffef [ 1.428086] x23: ffff000009408000 x22: 0000000000000000 [ 1.428415] x21: ffff000009136008 x20: ffff000009408730 [ 1.428744] x19: ffff80007b20b400 x18: 000000000000000a [ 1.429075] x17: 0000000000000000 x16: 0000000000000000 [ 1.429418] x15: 0000000000000400 x14: 2e79726f74636572 [ 1.429748] x13: 696420656d617320 x12: 656874206e692065 [ 1.430060] x11: 6d616e20656d6173 x10: 2065687420687469 [ 1.430335] x9 : ffff00000804bd50 x8 : 206e6f7361657220 [ 1.430610] x7 : 2c3376756d705f38 x6 : ffff00000954d7ce [ 1.430880] x5 : 0000000000000000 x4 : 0000000000000000 [ 1.431226] x3 : 0000000000000000 x2 : ffffffffffffffff [ 1.431554] x1 : 4d151327adc50b00 x0 : 0000000000000000 [ 1.431868] Call trace: [ 1.432102] perf_event_sysfs_init+0x98/0xdc [ 1.432382] do_one_initcall+0x6c/0x1a8 [ 1.432637] kernel_init_freeable+0x1bc/0x280 [ 1.432905] kernel_init+0x18/0x160 [ 1.433115] ret_from_fork+0x10/0x18 [ 1.433297] ---[ end trace 27fd415390eb9883 ]---
Rework to set suppress_bind_attrs flag to avoid removing the device when CONFIG_DEBUG_TEST_DRIVER_REMOVE=y, since there's no real reason to remove the armv8_pmuv3 driver.
Cc: Arnd Bergmann arnd@arndb.de Co-developed-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Anders Roxell anders.roxell@linaro.org Signed-off-by: Will Deacon will.deacon@arm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm64/kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index e213f8e867f6..8a91ac067d44 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1274,6 +1274,7 @@ static struct platform_driver armv8_pmu_driver = { .driver = { .name = ARMV8_PMU_PDEV_NAME, .of_match_table = armv8_pmu_of_device_ids, + .suppress_bind_attrs = true, }, .probe = armv8_pmu_device_probe, };
From: Nikolaj Fogh nikolajfogh@gmail.com
[ Upstream commit 6abd837104a3a8e1cda64fc4d7675f6c3ece9d8b ]
Improve baud-rate generation by using rounding-to-closest instead of truncation in divisor calculation.
Results have been verified by logic analyzer on an FT232RT (232BM) chip. The following table shows the wanted baud rate, the baud rate obtained with the old method (truncation), with the new method (rounding) and the baud rate generated by the windows 10 driver. The numbers in parentheses is the error.
+- Wanted --+------ Old -------+------ New -------+------ Win -------+ | 9600 | 9600 (0.00%) | 9604 (0.05%) | 9605 (0.05%) | | 19200 | 19200 (0.00%) | 19199 (0.01%) | 19198 (0.01%) | | 38400 | 38395 (0.01%) | 38431 (0.08%) | 38394 (0.02%) | | 57600 | 57725 (0.22%) | 57540 (0.10%) | 57673 (0.13%) | | 115200 | 115307 (0.09%) | 115330 (0.11%) | 115320 (0.10%) | | 921600 | 919963 (0.18%) | 920386 (0.13%) | 920810 (0.09%) | | 961200 | 996512 (3.67%) | 956480 (0.49%) | 956937 (0.44%) | +-----------+------------------+------------------+------------------+
The error due to noise in the measurements is in the order of a few tenths of a %. As can be seen, the baud rate is significantly improved for some rates (e.g. 961200), and corresponds to the output given by the windows driver.
The theoretical baud rate has been calculated for all baud rates from 1 to 3M, and as expected, the error is centered around 0, with a triangle shape instead of a sawtooth, so the maximum error is decreased to half.
Signed-off-by: Nikolaj Fogh nikolajfogh@gmail.com [ johan: edit commit message slightly ] Signed-off-by: Johan Hovold johan@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/usb/serial/ftdi_sio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index b5cef322826f..97101f666e07 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1114,7 +1114,7 @@ static unsigned short int ftdi_232am_baud_base_to_divisor(int baud, int base) { unsigned short int divisor; /* divisor shifted 3 bits to the left */ - int divisor3 = base / 2 / baud; + int divisor3 = DIV_ROUND_CLOSEST(base, 2 * baud); if ((divisor3 & 0x7) == 7) divisor3++; /* round x.7/8 up to x+1 */ divisor = divisor3 >> 3; @@ -1140,7 +1140,7 @@ static u32 ftdi_232bm_baud_base_to_divisor(int baud, int base) static const unsigned char divfrac[8] = { 0, 3, 2, 4, 1, 5, 6, 7 }; u32 divisor; /* divisor shifted 3 bits to the left */ - int divisor3 = base / 2 / baud; + int divisor3 = DIV_ROUND_CLOSEST(base, 2 * baud); divisor = divisor3 >> 3; divisor |= (u32)divfrac[divisor3 & 0x7] << 14; /* Deal with special cases for highest baud rates. */ @@ -1163,7 +1163,7 @@ static u32 ftdi_2232h_baud_base_to_divisor(int baud, int base) int divisor3;
/* hi-speed baud rate is 10-bit sampling instead of 16-bit */ - divisor3 = base * 8 / (baud * 10); + divisor3 = DIV_ROUND_CLOSEST(8 * base, 10 * baud);
divisor = divisor3 >> 3; divisor |= (u32)divfrac[divisor3 & 0x7] << 14;
From: Ville Syrjälä ville.syrjala@linux.intel.com
[ Upstream commit 2de42f79bb21a412f40ade8831eb6fc445cb78a4 ]
Consider the following scenario: 1. nonblocking enable crtc 2. wait for the event 3. nonblocking disable crtc
On i915 this can lead to a spurious -EBUSY from step 3 on account of non-enabled planes getting the fake_commit in step 1 and we don't complete the fake_commit-> flip_done until drm_atomic_helper_commit_hw_done() which can happen a long time after the flip event was sent out.
This will become somewhat easy to hit on SKL+ once we start to add all the planes for the crtc to every modeset commit for the purposes of forcing a watermark register programming [1].
To make the race a little less pronounced let's complete fake_commit->flip_done after drm_atomic_helper_wait_for_flip_done(). For the single crtc case this should make the race quite theoretical, assuming drm_atomic_helper_wait_for_flip_done() actually has to wait for the real commit flip_done. In case the real commit flip_done gets completed singificantly before drm_atomic_helper_wait_for_flip_done(), or we are dealing with multiple crtcs whose vblanks don't line up nicely the race still exists.
[1] https://patchwork.freedesktop.org/patch/262670/
Cc: Maarten Lankhorst maarten.lankhorst@linux.intel.com Fixes: 080de2e5be2d ("drm/atomic: Check for busy planes/connectors before setting the commit") Testcase: igt/kms_cursor_legacy/*nonblocking-modeset-vs-cursor-atomic Signed-off-by: Ville Syrjälä ville.syrjala@linux.intel.com Link: https://patchwork.freedesktop.org/patch/msgid/20181122143412.11655-1-ville.s... Reviewed-by: Maarten Lankhorst maarten.lankhorst@linux.intel.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/drm_atomic_helper.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 1bb4c318bdd4..f77bff5aa307 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1425,6 +1425,9 @@ void drm_atomic_helper_wait_for_flip_done(struct drm_device *dev, DRM_ERROR("[CRTC:%d:%s] flip_done timed out\n", crtc->base.id, crtc->name); } + + if (old_state->fake_commit) + complete_all(&old_state->fake_commit->flip_done); } EXPORT_SYMBOL(drm_atomic_helper_wait_for_flip_done);
From: Martin Blumenstingl martin.blumenstingl@googlemail.com
[ Upstream commit ad9b2b8e53af61375322e3c7d624acf3a3ef53b0 ]
The public S805 datasheet only mentions that HHI_SYS_CPU_CLK_CNTL1[20:29] contains a divider called "cpu_scale_div". Unfortunately it does not mention how to use the register contents.
The Amlogic 3.10 GPL kernel sources are using the following code to calculate the CPU clock based on that register (taken from arch/arm/mach-meson8/clock.c in the 3.10 Amlogic kernel, shortened to make it easier to read): N = (aml_read_reg32(P_HHI_SYS_CPU_CLK_CNTL1) >> 20) & 0x3FF; if (sel == 3) /* use cpu_scale_div */ div = 2 * N; else div = ... /* not relevant for this example */ cpu_clk = parent_clk / div;
This suggests that the formula is: parent_rate / 2 * register_value However, running perf (which can measure the CPU clock rate thanks to the ARM PMU) shows that this formula is not correct. This can be reproduced with the following steps: 1. boot into u-boot 2. let the CPU clock run off the XTAL clock: mw.l 0xC110419C 0x30 1 3. set the cpu_scale_div register: to value 0x1: mw.l 0xC110415C 0x801016A2 1 to value 0x2: mw.l 0xC110415C 0x802016A2 1 to value 0x5: mw.l 0xC110415C 0x805016A2 1 4. let the CPU clock run off cpu_scale_div: mw.l 0xC110419C 0xbd 1 5. boot Linux 6. run: perf stat -aB stress --cpu 4 --timeout 10 7. check the "cycles" value
I get the following results depending on the cpu_scale_div value: - (cpu_in_sel - this is the input clock for cpu_scale_div - runs at 1.2GHz) - 0x1 = 300MHz - 0x2 = 200MHz - 0x5 = 100MHz
This means that the actual formula to calculate the output of the cpu_scale_div clock is: parent_rate / 2 * (register value + 1).
The register value 0x0 is reserved. When letting the CPU clock run off the cpu_scale_div while the value is 0x0 the whole board hangs (even in u-boot).
I also verified this with the TWD timer: when adding this to the .dts without specifying it's clock it will auto-detect the PERIPH (which is the input clock of the TWD) clock rate (and the result is shown in the kernel log). On Meson8, Meson8b and Meson8m2 the PERIPH clock is CPUCLK divided by 4. This also matched for all three test-cases from above (in all cases the TWD timer clock rate was approx. one fourth of the CPU clock rate).
A small note regarding the "fixes" tag: the original issue seems to exist virtually since forever. Even commit 28b9fcd016126e ("clk: meson8b: Add support for Meson8b clocks") seems to handle this wrong. I still decided to use commit 251b6fd38bcb9c ("clk: meson: rework meson8b cpu clock") because this is the first commit which gets the CPU hiearchy correct and thus it's the first commit where the cpu_scale_div register is used correctly (apart from the bug in the cpu_scale_table).
Fixes: 251b6fd38bcb9c ("clk: meson: rework meson8b cpu clock") Signed-off-by: Martin Blumenstingl martin.blumenstingl@googlemail.com Signed-off-by: Neil Armstrong narmstrong@baylibre.com Link: https://lkml.kernel.org/r/20180927085921.24627-2-martin.blumenstingl@googlem... Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/clk/meson/meson8b.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/drivers/clk/meson/meson8b.c b/drivers/clk/meson/meson8b.c index 74697e145dde..50060e895e7a 100644 --- a/drivers/clk/meson/meson8b.c +++ b/drivers/clk/meson/meson8b.c @@ -568,13 +568,14 @@ static struct clk_fixed_factor meson8b_cpu_div3 = { };
static const struct clk_div_table cpu_scale_table[] = { - { .val = 2, .div = 4 }, - { .val = 3, .div = 6 }, - { .val = 4, .div = 8 }, - { .val = 5, .div = 10 }, - { .val = 6, .div = 12 }, - { .val = 7, .div = 14 }, - { .val = 8, .div = 16 }, + { .val = 1, .div = 4 }, + { .val = 2, .div = 6 }, + { .val = 3, .div = 8 }, + { .val = 4, .div = 10 }, + { .val = 5, .div = 12 }, + { .val = 6, .div = 14 }, + { .val = 7, .div = 16 }, + { .val = 8, .div = 18 }, { /* sentinel */ }, };
From: "Daniel T. Lee" danieltimlee@gmail.com
[ Upstream commit 5a863813216ce79e16a8c1503b2543c528b778b6 ]
Currently, kprobe_events failure won't be handled properly. Due to calling system() indirectly to write to kprobe_events, it can't be identified whether an error is derived from kprobe or system.
// buf = "echo '%c:%s %s' >> /s/k/d/t/kprobe_events" err = system(buf); if (err < 0) { printf("failed to create kprobe .."); return -1; }
For example, running ./tracex7 sample in ext4 partition, "echo p:open_ctree open_ctree >> /s/k/d/t/kprobe_events" gets 256 error code system() failure. => The error comes from kprobe, but it's not handled correctly.
According to man of system(3), it's return value just passes the termination status of the child shell rather than treating the error as -1. (don't care success)
Which means, currently it's not working as desired. (According to the upper code snippet)
ex) running ./tracex7 with ext4 env. # Current Output sh: echo: I/O error failed to open event open_ctree
# Desired Output failed to create kprobe 'open_ctree' error 'No such file or directory'
The problem is, error can't be verified whether from child ps or system. But using write() directly can verify the command failure, and it will treat all error as -1. So I suggest using write() directly to 'kprobe_events' rather than calling system().
Signed-off-by: Daniel T. Lee danieltimlee@gmail.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Signed-off-by: Sasha Levin sashal@kernel.org --- samples/bpf/bpf_load.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-)
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 904e775d1a44..cf40a8284a38 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -55,6 +55,23 @@ static int populate_prog_array(const char *event, int prog_fd) return 0; }
+static int write_kprobe_events(const char *val) +{ + int fd, ret, flags; + + if ((val != NULL) && (val[0] == '\0')) + flags = O_WRONLY | O_TRUNC; + else + flags = O_WRONLY | O_APPEND; + + fd = open("/sys/kernel/debug/tracing/kprobe_events", flags); + + ret = write(fd, val, strlen(val)); + close(fd); + + return ret; +} + static int load_and_attach(const char *event, struct bpf_insn *prog, int size) { bool is_socket = strncmp(event, "socket", 6) == 0; @@ -166,10 +183,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
#ifdef __x86_64__ if (strncmp(event, "sys_", 4) == 0) { - snprintf(buf, sizeof(buf), - "echo '%c:__x64_%s __x64_%s' >> /sys/kernel/debug/tracing/kprobe_events", - is_kprobe ? 'p' : 'r', event, event); - err = system(buf); + snprintf(buf, sizeof(buf), "%c:__x64_%s __x64_%s", + is_kprobe ? 'p' : 'r', event, event); + err = write_kprobe_events(buf); if (err >= 0) { need_normal_check = false; event_prefix = "__x64_"; @@ -177,10 +193,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) } #endif if (need_normal_check) { - snprintf(buf, sizeof(buf), - "echo '%c:%s %s' >> /sys/kernel/debug/tracing/kprobe_events", - is_kprobe ? 'p' : 'r', event, event); - err = system(buf); + snprintf(buf, sizeof(buf), "%c:%s %s", + is_kprobe ? 'p' : 'r', event, event); + err = write_kprobe_events(buf); if (err < 0) { printf("failed to create kprobe '%s' error '%s'\n", event, strerror(errno)); @@ -520,7 +535,7 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) return 1;
/* clear all kprobes */ - i = system("echo "" > /sys/kernel/debug/tracing/kprobe_events"); + i = write_kprobe_events("");
/* scan over all elf sections to get license and map info */ for (i = 1; i < ehdr.e_shnum; i++) {
From: Yoshihiro Shimoda yoshihiro.shimoda.uh@renesas.com
[ Upstream commit ceb94bc52c437463f0903e61060a94a2226fb672 ]
This patch adds a safety connection way for "forced_b_device" with "workaround_for_vbus" like below:
< Example for R-Car E3 Ebisu > # modprobe <any usb gadget driver> # echo 1 > /sys/kernel/debug/ee020000.usb/b_device (connect a usb cable to host side.) # echo 2 > /sys/kernel/debug/ee020000.usb/b_device
Previous code should have connected a usb cable before the "b_device" is set to 1 on the Ebisu board. However, if xHCI driver on the board is probed, it causes some troubles: - Conflicts USB VBUS/signals between the board and another host. - "Cannot enable. Maybe the USB cable is bad?" might happen on both the board and another host with a usb hub. - Cannot enumerate a usb gadget correctly because an interruption of VBUS change happens unexpectedly.
Reported-by: Kazuya Mizuguchi kazuya.mizuguchi.ks@renesas.com Signed-off-by: Yoshihiro Shimoda yoshihiro.shimoda.uh@renesas.com Signed-off-by: Felipe Balbi felipe.balbi@linux.intel.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/usb/gadget/udc/renesas_usb3.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 67d8a501d994..fea02c7ad4f4 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -358,6 +358,7 @@ struct renesas_usb3 { bool extcon_host; /* check id and set EXTCON_USB_HOST */ bool extcon_usb; /* check vbus and set EXTCON_USB */ bool forced_b_device; + bool start_to_connect; };
#define gadget_to_renesas_usb3(_gadget) \ @@ -476,7 +477,8 @@ static void usb3_init_axi_bridge(struct renesas_usb3 *usb3) static void usb3_init_epc_registers(struct renesas_usb3 *usb3) { usb3_write(usb3, ~0, USB3_USB_INT_STA_1); - usb3_enable_irq_1(usb3, USB_INT_1_VBUS_CNG); + if (!usb3->workaround_for_vbus) + usb3_enable_irq_1(usb3, USB_INT_1_VBUS_CNG); }
static bool usb3_wakeup_usb2_phy(struct renesas_usb3 *usb3) @@ -700,8 +702,7 @@ static void usb3_mode_config(struct renesas_usb3 *usb3, bool host, bool a_dev) usb3_set_mode_by_role_sw(usb3, host); usb3_vbus_out(usb3, a_dev); /* for A-Peripheral or forced B-device mode */ - if ((!host && a_dev) || - (usb3->workaround_for_vbus && usb3->forced_b_device)) + if ((!host && a_dev) || usb3->start_to_connect) usb3_connect(usb3); spin_unlock_irqrestore(&usb3->lock, flags); } @@ -2432,7 +2433,11 @@ static ssize_t renesas_usb3_b_device_write(struct file *file, if (copy_from_user(&buf, ubuf, min_t(size_t, sizeof(buf) - 1, count))) return -EFAULT;
- if (!strncmp(buf, "1", 1)) + usb3->start_to_connect = false; + if (usb3->workaround_for_vbus && usb3->forced_b_device && + !strncmp(buf, "2", 1)) + usb3->start_to_connect = true; + else if (!strncmp(buf, "1", 1)) usb3->forced_b_device = true; else usb3->forced_b_device = false; @@ -2440,7 +2445,7 @@ static ssize_t renesas_usb3_b_device_write(struct file *file, if (usb3->workaround_for_vbus) usb3_disconnect(usb3);
- /* Let this driver call usb3_connect() anyway */ + /* Let this driver call usb3_connect() if needed */ usb3_check_id(usb3);
return count;
From: Anatolij Gustschin agust@denx.de
[ Upstream commit 30522a951f9d02f261d0697c35cb42205b1fae17 ]
Currently registering CvP managers works only for first probed CvP device, for all other devices it is refused due to duplicated chkcfg sysfs entry:
fpga_manager fpga3: Altera CvP FPGA Manager @0000:0c:00.0 registered sysfs: cannot create duplicate filename '/bus/pci/drivers/altera-cvp/chkcfg' CPU: 0 PID: 3808 Comm: bash Tainted: G O 4.19.0-custom+ #5 Call Trace: dump_stack+0x46/0x5b sysfs_warn_dup+0x53/0x60 sysfs_add_file_mode_ns+0x16d/0x180 sysfs_create_file_ns+0x51/0x60 altera_cvp_probe+0x16f/0x2a0 [altera_cvp] local_pci_probe+0x3f/0xa0 ? pci_match_device+0xb1/0xf0 pci_device_probe+0x116/0x170 really_probe+0x21b/0x2c0 driver_probe_device+0x4b/0xe0 bind_store+0xcb/0x130 kernfs_fop_write+0xfd/0x180 __vfs_write+0x21/0x150 ? selinux_file_permission+0xdc/0x130 vfs_write+0xa8/0x1a0 ? find_vma+0xd/0x60 ksys_write+0x3d/0x90 do_syscall_64+0x44/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ... altera-cvp 0000:0c:00.0: Can't create sysfs chkcfg file fpga_manager fpga3: fpga_mgr_unregister Altera CvP FPGA Manager @0000:0c:00.0
Move chkcfg creation to module init as suggested by Alan.
Signed-off-by: Anatolij Gustschin agust@denx.de Acked-by: Alan Tull atull@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/fpga/altera-cvp.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-)
diff --git a/drivers/fpga/altera-cvp.c b/drivers/fpga/altera-cvp.c index 7fa793672a7a..68e4b2b98c8f 100644 --- a/drivers/fpga/altera-cvp.c +++ b/drivers/fpga/altera-cvp.c @@ -468,14 +468,6 @@ static int altera_cvp_probe(struct pci_dev *pdev, goto err_unmap; }
- ret = driver_create_file(&altera_cvp_driver.driver, - &driver_attr_chkcfg); - if (ret) { - dev_err(&pdev->dev, "Can't create sysfs chkcfg file\n"); - fpga_mgr_unregister(mgr); - goto err_unmap; - } - return 0;
err_unmap: @@ -493,7 +485,6 @@ static void altera_cvp_remove(struct pci_dev *pdev) struct altera_cvp_conf *conf = mgr->priv; u16 cmd;
- driver_remove_file(&altera_cvp_driver.driver, &driver_attr_chkcfg); fpga_mgr_unregister(mgr); pci_iounmap(pdev, conf->map); pci_release_region(pdev, CVP_BAR); @@ -502,7 +493,30 @@ static void altera_cvp_remove(struct pci_dev *pdev) pci_write_config_word(pdev, PCI_COMMAND, cmd); }
-module_pci_driver(altera_cvp_driver); +static int __init altera_cvp_init(void) +{ + int ret; + + ret = pci_register_driver(&altera_cvp_driver); + if (ret) + return ret; + + ret = driver_create_file(&altera_cvp_driver.driver, + &driver_attr_chkcfg); + if (ret) + pr_warn("Can't create sysfs chkcfg file\n"); + + return 0; +} + +static void __exit altera_cvp_exit(void) +{ + driver_remove_file(&altera_cvp_driver.driver, &driver_attr_chkcfg); + pci_unregister_driver(&altera_cvp_driver); +} + +module_init(altera_cvp_init); +module_exit(altera_cvp_exit);
MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Anatolij Gustschin agust@denx.de");
From: Ondrej Mosnacek omosnace@redhat.com
[ Upstream commit 2cbdcb882f97a45f7475c67ac6257bbc16277dfe ]
If a superblock has the MS_SUBMOUNT flag set, we should always allow mounting it. These mounts are done automatically by the kernel either as part of mounting some parent mount (e.g. debugfs always mounts tracefs under "tracing" for compatibility) or they are mounted automatically as needed on subdirectory accesses (e.g. NFS crossmnt mounts). Since such automounts are either an implicit consequence of the parent mount (which is already checked) or they can happen during regular accesses (where it doesn't make sense to check against the current task's context), the mount permission check should be skipped for them.
Without this patch, attempts to access contents of an automounted directory can cause unexpected SELinux denials.
In the current kernel tree, the MS_SUBMOUNT flag is set only via vfs_submount(), which is called only from the following places: - AFS, when automounting special "symlinks" referencing other cells - CIFS, when automounting "referrals" - NFS, when automounting subtrees - debugfs, when automounting tracefs
In all cases the submounts are meant to be transparent to the user and it makes sense that if mounting the master is allowed, then so should be the automounts. Note that CAP_SYS_ADMIN capability checking is already skipped for (SB_KERNMOUNT|SB_SUBMOUNT) in: - sget_userns() in fs/super.c: if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !(type->fs_flags & FS_USERNS_MOUNT) && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - sget() in fs/super.c: /* Ensure the requestor has permissions over the target filesystem */ if (!(flags & (SB_KERNMOUNT|SB_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM);
Verified internally on patched RHEL 7.6 with a reproducer using NFS+httpd and selinux-tesuite.
Fixes: 93faccbbfa95 ("fs: Better permission checking for submounts") Signed-off-by: Ondrej Mosnacek omosnace@redhat.com Signed-off-by: Paul Moore paul@paul-moore.com Signed-off-by: Sasha Levin sashal@kernel.org --- security/selinux/hooks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index fe251c6f09f1..3c3878f0d2fa 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2934,7 +2934,7 @@ static int selinux_sb_kern_mount(struct super_block *sb, int flags, void *data) return rc;
/* Allow all mounts performed by the kernel */ - if (flags & MS_KERNMOUNT) + if (flags & (MS_KERNMOUNT | MS_SUBMOUNT)) return 0;
ad.type = LSM_AUDIT_DATA_DENTRY;
From: Nicholas Kazlauskas nicholas.kazlauskas@amd.com
[ Upstream commit 520f08df45fbe300ed650da786a74093d658b7e1 ]
When variable refresh rate is active the hardware counter can return a position >= vtotal. This results in a vpos being returned from amdgpu_display_get_crtc_scanoutpos that's a positive value. The positive value indicates to the caller that the display is currently in scanout when the display is actually still in vblank.
This is because the vfront porch duration is unknown with variable refresh active and will end when either a page flip occurs or the timeout specified by the driver/display is reached.
The behavior of the amdgpu_display_get_crtc_scanoutpos remains the same when the position is below vtotal. When the position is above vtotal the function will return a value that is effectively -vbl_end, the size of the vback porch.
The only caller affected by this change is the DRM helper for calculating vblank timestamps. This change corrects behavior for calculating the page flip timestamp from being the previous timestamp to the calculation to the next timestamp when position >= vtotal.
Signed-off-by: Nicholas Kazlauskas nicholas.kazlauskas@amd.com Reviewed-by: Harry Wentland harry.wentland@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 686a26de50f9..9c940bbed608 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -857,7 +857,12 @@ int amdgpu_display_get_crtc_scanoutpos(struct drm_device *dev, /* Inside "upper part" of vblank area? Apply corrective offset if so: */ if (in_vbl && (*vpos >= vbl_start)) { vtotal = mode->crtc_vtotal; - *vpos = *vpos - vtotal; + + /* With variable refresh rate displays the vpos can exceed + * the vtotal value. Clamp to 0 to return -vbl_end instead + * of guessing the remaining number of lines until scanout. + */ + *vpos = (*vpos < vtotal) ? (*vpos - vtotal) : 0; }
/* Correct for shifted end of vbl at vbl_end. */
From: Yufen Yu yuyufen@huawei.com
[ Upstream commit 94a2c3a32b62e868dc1e3d854326745a7f1b8c7a ]
We recently got a stack by syzkaller like this:
BUG: sleeping function called from invalid context at mm/slab.h:361 in_atomic(): 1, irqs_disabled(): 0, pid: 6644, name: blkid INFO: lockdep is turned off. CPU: 1 PID: 6644 Comm: blkid Not tainted 4.4.163-514.55.6.9.x86_64+ #76 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 0000000000000000 5ba6a6b879e50c00 ffff8801f6b07b10 ffffffff81cb2194 0000000041b58ab3 ffffffff833c7745 ffffffff81cb2080 5ba6a6b879e50c00 0000000000000000 0000000000000001 0000000000000004 0000000000000000 Call Trace: <IRQ> [<ffffffff81cb2194>] __dump_stack lib/dump_stack.c:15 [inline] <IRQ> [<ffffffff81cb2194>] dump_stack+0x114/0x1a0 lib/dump_stack.c:51 [<ffffffff8129a981>] ___might_sleep+0x291/0x490 kernel/sched/core.c:7675 [<ffffffff8129ac33>] __might_sleep+0xb3/0x270 kernel/sched/core.c:7637 [<ffffffff81794c13>] slab_pre_alloc_hook mm/slab.h:361 [inline] [<ffffffff81794c13>] slab_alloc_node mm/slub.c:2610 [inline] [<ffffffff81794c13>] slab_alloc mm/slub.c:2692 [inline] [<ffffffff81794c13>] kmem_cache_alloc_trace+0x2c3/0x5c0 mm/slub.c:2709 [<ffffffff81cbe9a7>] kmalloc include/linux/slab.h:479 [inline] [<ffffffff81cbe9a7>] kzalloc include/linux/slab.h:623 [inline] [<ffffffff81cbe9a7>] kobject_uevent_env+0x2c7/0x1150 lib/kobject_uevent.c:227 [<ffffffff81cbf84f>] kobject_uevent+0x1f/0x30 lib/kobject_uevent.c:374 [<ffffffff81cbb5b9>] kobject_cleanup lib/kobject.c:633 [inline] [<ffffffff81cbb5b9>] kobject_release+0x229/0x440 lib/kobject.c:675 [<ffffffff81cbb0a2>] kref_sub include/linux/kref.h:73 [inline] [<ffffffff81cbb0a2>] kref_put include/linux/kref.h:98 [inline] [<ffffffff81cbb0a2>] kobject_put+0x72/0xd0 lib/kobject.c:692 [<ffffffff8216f095>] put_device+0x25/0x30 drivers/base/core.c:1237 [<ffffffff81c4cc34>] delete_partition_rcu_cb+0x1d4/0x2f0 block/partition-generic.c:232 [<ffffffff813c08bc>] __rcu_reclaim kernel/rcu/rcu.h:118 [inline] [<ffffffff813c08bc>] rcu_do_batch kernel/rcu/tree.c:2705 [inline] [<ffffffff813c08bc>] invoke_rcu_callbacks kernel/rcu/tree.c:2973 [inline] [<ffffffff813c08bc>] __rcu_process_callbacks kernel/rcu/tree.c:2940 [inline] [<ffffffff813c08bc>] rcu_process_callbacks+0x59c/0x1c70 kernel/rcu/tree.c:2957 [<ffffffff8120f509>] __do_softirq+0x299/0xe20 kernel/softirq.c:273 [<ffffffff81210496>] invoke_softirq kernel/softirq.c:350 [inline] [<ffffffff81210496>] irq_exit+0x216/0x2c0 kernel/softirq.c:391 [<ffffffff82c2cd7b>] exiting_irq arch/x86/include/asm/apic.h:652 [inline] [<ffffffff82c2cd7b>] smp_apic_timer_interrupt+0x8b/0xc0 arch/x86/kernel/apic/apic.c:926 [<ffffffff82c2bc25>] apic_timer_interrupt+0xa5/0xb0 arch/x86/entry/entry_64.S:746 <EOI> [<ffffffff814cbf40>] ? audit_kill_trees+0x180/0x180 [<ffffffff8187d2f7>] fd_install+0x57/0x80 fs/file.c:626 [<ffffffff8180989e>] do_sys_open+0x45e/0x550 fs/open.c:1043 [<ffffffff818099c2>] SYSC_open fs/open.c:1055 [inline] [<ffffffff818099c2>] SyS_open+0x32/0x40 fs/open.c:1050 [<ffffffff82c299e1>] entry_SYSCALL_64_fastpath+0x1e/0x9a
In softirq context, we call rcu callback function delete_partition_rcu_cb(), which may allocate memory by kzalloc with GFP_KERNEL flag. If the allocation cannot be satisfied, it may sleep. However, That is not allowed in softirq contex.
Although we found this problem on linux 4.4, the latest kernel version seems to have this problem as well. And it is very similar to the previous one: https://lkml.org/lkml/2018/7/9/391
Fix it by using RCU workqueue, which allows sleep.
Reviewed-by: Paul E. McKenney paulmck@linux.ibm.com Signed-off-by: Yufen Yu yuyufen@huawei.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- block/partition-generic.c | 8 +++++--- include/linux/genhd.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/block/partition-generic.c b/block/partition-generic.c index d3d14e81fb12..5f8db5c5140f 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -249,9 +249,10 @@ struct device_type part_type = { .uevent = part_uevent, };
-static void delete_partition_rcu_cb(struct rcu_head *head) +static void delete_partition_work_fn(struct work_struct *work) { - struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct, + rcu_work);
part->start_sect = 0; part->nr_sects = 0; @@ -262,7 +263,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head) void __delete_partition(struct percpu_ref *ref) { struct hd_struct *part = container_of(ref, struct hd_struct, ref); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn); + queue_rcu_work(system_wq, &part->rcu_work); }
/* diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 25c08c6c7f99..f767293b00e6 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -129,7 +129,7 @@ struct hd_struct { struct disk_stats dkstats; #endif struct percpu_ref ref; - struct rcu_head rcu_head; + struct rcu_work rcu_work; };
#define GENHD_FL_REMOVABLE 1
From: Jiada Wang jiada_wang@mentor.com
[ Upstream commit 489db5d941500249583ec6b49fa70e006bd8f632 ]
pcm3168 codec support runtime_[resume|suspend], whenever it is not active, it enters suspend mode, and it's clock and regulators will be disabled. so there is no need to disable them again in remove callback. Otherwise we got following kernel warnings, when unload pcm3168a driver
[ 222.257514] unbalanced disables for amp-en-regulator [ 222.262526] ------------[ cut here ]------------ [ 222.267158] WARNING: CPU: 0 PID: 2423 at drivers/regulator/core.c:2264 _regulator_disable+0x28/0x108 [ 222.276291] Modules linked in: [ 222.279343] snd_soc_pcm3168a_i2c(-) [ 222.282916] snd_aloop [ 222.285272] arc4 [ 222.287194] wl18xx [ 222.289289] wlcore [ 222.291385] mac80211 [ 222.293654] cfg80211 [ 222.295923] aes_ce_blk [ 222.298366] crypto_simd [ 222.300896] cryptd [ 222.302992] aes_ce_cipher [ 222.305696] crc32_ce [ 222.307965] ghash_ce [ 222.310234] aes_arm64 [ 222.312590] gf128mul [ 222.314860] snd_soc_rcar [ 222.317476] sha2_ce [ 222.319658] xhci_plat_hcd [ 222.322362] sha256_arm64 [ 222.324978] xhci_hcd [ 222.327247] sha1_ce [ 222.329430] renesas_usbhs [ 222.332133] evdev [ 222.334142] sha1_generic [ 222.336758] rcar_gen3_thermal [ 222.339810] cpufreq_dt [ 222.342253] ravb_streaming(C) [ 222.345304] wlcore_sdio [ 222.347834] thermal_sys [ 222.350363] udc_core [ 222.352632] mch_core(C) [ 222.355161] usb_dmac [ 222.357430] snd_soc_pcm3168a [ 222.360394] snd_soc_ak4613 [ 222.363184] gpio_keys [ 222.365540] virt_dma [ 222.367809] nfsd [ 222.369730] ipv6 [ 222.371652] autofs4 [ 222.373834] [last unloaded: snd_soc_pcm3168a_i2c] [ 222.378629] CPU: 0 PID: 2423 Comm: rmmod Tainted: G WC 4.14.63-04798-gd456126e4a42-dirty #457 [ 222.388196] Hardware name: Renesas H3ULCB Kingfisher board based on r8a7795 ES2.0+ (DT) [ 222.396199] task: ffff8006fa8c6200 task.stack: ffff00000a0a0000 [ 222.402117] PC is at _regulator_disable+0x28/0x108 [ 222.406906] LR is at _regulator_disable+0x28/0x108 [ 222.411695] pc : [<ffff0000083bd89c>] lr : [<ffff0000083bd89c>] pstate: 00000145 [ 222.419089] sp : ffff00000a0a3c80 [ 222.422401] x29: ffff00000a0a3c80 [ 222.425799] x28: ffff8006fa8c6200 [ 222.429199] x27: ffff0000086f1000 [ 222.432597] x26: 000000000000006a [ 222.435997] x25: 0000000000000124 [ 222.439395] x24: 0000000000000018 [ 222.442795] x23: 0000000000000006 [ 222.446193] x22: ffff8006f925d490 [ 222.449592] x21: ffff8006f9ac2068 [ 222.452991] x20: ffff8006f9ac2000 [ 222.456390] x19: 0000000000000005 [ 222.459787] x18: 000000000000000a [ 222.463186] x17: 0000000000000000 [ 222.466584] x16: 0000000000000000 [ 222.469984] x15: 000000000d3f616a [ 222.473382] x14: 0720072007200720 [ 222.476781] x13: 0720072007200720 [ 222.480179] x12: 0720072007200720 [ 222.483578] x11: 0720072007200720 [ 222.486975] x10: 0720072007200720 [ 222.490375] x9 : 0720072007200720 [ 222.493773] x8 : 07200772076f0774 [ 222.497172] x7 : 0000000000000000 [ 222.500570] x6 : 0000000000000007 [ 222.503969] x5 : 0000000000000000 [ 222.507367] x4 : 0000000000000000 [ 222.510766] x3 : 0000000000000000 [ 222.514164] x2 : c790b852091e2600 [ 222.517563] x1 : 0000000000000000 [ 222.520961] x0 : 0000000000000028 [ 222.524361] Call trace: [ 222.526805] Exception stack(0xffff00000a0a3b40 to 0xffff00000a0a3c80) [ 222.533245] 3b40: 0000000000000028 0000000000000000 c790b852091e2600 0000000000000000 [ 222.541075] 3b60: 0000000000000000 0000000000000000 0000000000000007 0000000000000000 [ 222.548905] 3b80: 07200772076f0774 0720072007200720 0720072007200720 0720072007200720 [ 222.556735] 3ba0: 0720072007200720 0720072007200720 0720072007200720 000000000d3f616a [ 222.564564] 3bc0: 0000000000000000 0000000000000000 000000000000000a 0000000000000005 [ 222.572394] 3be0: ffff8006f9ac2000 ffff8006f9ac2068 ffff8006f925d490 0000000000000006 [ 222.580224] 3c00: 0000000000000018 0000000000000124 000000000000006a ffff0000086f1000 [ 222.588053] 3c20: ffff8006fa8c6200 ffff00000a0a3c80 ffff0000083bd89c ffff00000a0a3c80 [ 222.595883] 3c40: ffff0000083bd89c 0000000000000145 0000000000000000 0000000000000000 [ 222.603713] 3c60: 0000ffffffffffff ffff00000a0a3c30 ffff00000a0a3c80 ffff0000083bd89c [ 222.611543] [<ffff0000083bd89c>] _regulator_disable+0x28/0x108 [ 222.617375] [<ffff0000083bd9c4>] regulator_disable+0x48/0x68 [ 222.623033] [<ffff0000083be8e4>] regulator_bulk_disable+0x58/0xc0 [ 222.629134] [<ffff0000007d831c>] pcm3168a_remove+0x30/0x50 [snd_soc_pcm3168a] [ 222.636270] [<ffff0000007e5010>] pcm3168a_i2c_remove+0x10/0x1c [snd_soc_pcm3168a_i2c] [ 222.644106] [<ffff0000084b9d9c>] i2c_device_remove+0x38/0x70 [ 222.649766] [<ffff00000843cd5c>] device_release_driver_internal+0xd0/0x1c0 [ 222.656640] [<ffff00000843ced8>] driver_detach+0x70/0x7c [ 222.661951] [<ffff00000843bf68>] bus_remove_driver+0x74/0xa0 [ 222.667609] [<ffff00000843d7e4>] driver_unregister+0x48/0x4c [ 222.673268] [<ffff0000084ba8dc>] i2c_del_driver+0x24/0x30 [ 222.678666] [<ffff0000007e5078>] pcm3168a_i2c_driver_exit+0x10/0xf98 [snd_soc_pcm3168a_i2c] [ 222.687019] [<ffff00000811bd28>] SyS_delete_module+0x198/0x1d4 [ 222.692850] Exception stack(0xffff00000a0a3ec0 to 0xffff00000a0a4000) [ 222.699289] 3ec0: 0000aaaafeb4b268 0000000000000800 14453f6470497100 0000fffffaa520d8 [ 222.707119] 3ee0: 0000fffffaa520d9 000000000000000a 1999999999999999 0000000000000000 [ 222.714948] 3f00: 000000000000006a 0000ffffa8f7d1d8 000000000000000a 0000000000000005 [ 222.722778] 3f20: 0000000000000000 0000000000000000 000000000000002d 0000000000000000 [ 222.730607] 3f40: 0000aaaae19b9f68 0000ffffa8f411f0 0000000000000000 0000aaaae19b9000 [ 222.738436] 3f60: 0000fffffaa533b8 0000fffffaa531f0 0000000000000000 0000000000000001 [ 222.746266] 3f80: 0000fffffaa53ec6 0000000000000000 0000aaaafeb4b200 0000aaaafeb4a010 [ 222.754096] 3fa0: 0000000000000000 0000fffffaa53130 0000aaaae199f36c 0000fffffaa53130 [ 222.761926] 3fc0: 0000ffffa8f411f8 0000000000000000 0000aaaafeb4b268 000000000000006a [ 222.769755] 3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 222.777589] [<ffff0000080832c0>] el0_svc_naked+0x34/0x38 [ 222.782899] ---[ end trace eaf8939a3698b1a8 ]--- [ 222.787609] Failed to disable VCCDA2: -5 [ 222.791649] ------------[ cut here ]------------ [ 222.796283] WARNING: CPU: 0 PID: 2423 at drivers/clk/clk.c:595 clk_core_disable+0xc/0x1d8 [ 222.804460] Modules linked in: [ 222.807511] snd_soc_pcm3168a_i2c(-) [ 222.811083] snd_aloop [ 222.813439] arc4 [ 222.815360] wl18xx [ 222.817456] wlcore [ 222.819551] mac80211 [ 222.821820] cfg80211 [ 222.824088] aes_ce_blk [ 222.826531] crypto_simd [ 222.829060] cryptd [ 222.831155] aes_ce_cipher [ 222.833859] crc32_ce [ 222.836127] ghash_ce [ 222.838396] aes_arm64 [ 222.840752] gf128mul [ 222.843020] snd_soc_rcar [ 222.845637] sha2_ce [ 222.847818] xhci_plat_hcd [ 222.850522] sha256_arm64 [ 222.853138] xhci_hcd [ 222.855407] sha1_ce [ 222.857589] renesas_usbhs [ 222.860292] evdev [ 222.862300] sha1_generic [ 222.864917] rcar_gen3_thermal [ 222.867968] cpufreq_dt [ 222.870410] ravb_streaming(C) [ 222.873461] wlcore_sdio [ 222.875991] thermal_sys [ 222.878520] udc_core [ 222.880789] mch_core(C) [ 222.883318] usb_dmac [ 222.885587] snd_soc_pcm3168a [ 222.888551] snd_soc_ak4613 [ 222.891341] gpio_keys [ 222.893696] virt_dma [ 222.895965] nfsd [ 222.897886] ipv6 [ 222.899808] autofs4 [ 222.901990] [last unloaded: snd_soc_pcm3168a_i2c] [ 222.906783] CPU: 0 PID: 2423 Comm: rmmod Tainted: G WC 4.14.63-04798-gd456126e4a42-dirty #457 [ 222.916349] Hardware name: Renesas H3ULCB Kingfisher board based on r8a7795 ES2.0+ (DT) [ 222.924351] task: ffff8006fa8c6200 task.stack: ffff00000a0a0000 [ 222.930270] PC is at clk_core_disable+0xc/0x1d8 [ 222.934799] LR is at clk_core_disable_lock+0x20/0x34 [ 222.939761] pc : [<ffff0000083ab9b8>] lr : [<ffff0000083acd28>] pstate: 800001c5 [ 222.947154] sp : ffff00000a0a3cf0 [ 222.950466] x29: ffff00000a0a3cf0 [ 222.953864] x28: ffff8006fa8c6200 [ 222.957263] x27: ffff0000086f1000 [ 222.960661] x26: 000000000000006a [ 222.964061] x25: 0000000000000124 [ 222.967458] x24: 0000000000000015 [ 222.970858] x23: ffff8006f9ffa8d0 [ 222.974256] x22: ffff8006faf16480 [ 222.977655] x21: ffff0000007e7040 [ 222.981053] x20: ffff8006faadd100 [ 222.984452] x19: 0000000000000140 [ 222.987850] x18: 000000000000000a [ 222.991249] x17: 0000000000000000 [ 222.994647] x16: 0000000000000000 [ 222.998046] x15: 000000000d477819 [ 223.001444] x14: 0720072007200720 [ 223.004843] x13: 0720072007200720 [ 223.008242] x12: 0720072007200720 [ 223.011641] x11: 0720072007200720 [ 223.015039] x10: 0720072007200720 [ 223.018438] x9 : 0720072007200720 [ 223.021837] x8 : 0720072007200720 [ 223.025236] x7 : 0000000000000000 [ 223.028634] x6 : 0000000000000007 [ 223.032034] x5 : 0000000000000000 [ 223.035432] x4 : 0000000000000000 [ 223.038831] x3 : 0000000000000000 [ 223.042229] x2 : 0000000004720471 [ 223.045628] x1 : 0000000000000000 [ 223.049026] x0 : ffff8006faadd100 [ 223.052426] Call trace: [ 223.054870] Exception stack(0xffff00000a0a3bb0 to 0xffff00000a0a3cf0) [ 223.061309] 3ba0: ffff8006faadd100 0000000000000000 [ 223.069139] 3bc0: 0000000004720471 0000000000000000 0000000000000000 0000000000000000 [ 223.076969] 3be0: 0000000000000007 0000000000000000 0720072007200720 0720072007200720 [ 223.084798] 3c00: 0720072007200720 0720072007200720 0720072007200720 0720072007200720 [ 223.092628] 3c20: 0720072007200720 000000000d477819 0000000000000000 0000000000000000 [ 223.100458] 3c40: 000000000000000a 0000000000000140 ffff8006faadd100 ffff0000007e7040 [ 223.108287] 3c60: ffff8006faf16480 ffff8006f9ffa8d0 0000000000000015 0000000000000124 [ 223.116117] 3c80: 000000000000006a ffff0000086f1000 ffff8006fa8c6200 ffff00000a0a3cf0 [ 223.123947] 3ca0: ffff0000083acd28 ffff00000a0a3cf0 ffff0000083ab9b8 00000000800001c5 [ 223.131777] 3cc0: ffff00000a0a3cf0 ffff0000083acd1c 0000ffffffffffff ffff8006faadd100 [ 223.139606] 3ce0: ffff00000a0a3cf0 ffff0000083ab9b8 [ 223.144483] [<ffff0000083ab9b8>] clk_core_disable+0xc/0x1d8 [ 223.150054] [<ffff0000083acd58>] clk_disable+0x1c/0x28 [ 223.155198] [<ffff0000007d8328>] pcm3168a_remove+0x3c/0x50 [snd_soc_pcm3168a] [ 223.162334] [<ffff0000007e5010>] pcm3168a_i2c_remove+0x10/0x1c [snd_soc_pcm3168a_i2c] [ 223.170167] [<ffff0000084b9d9c>] i2c_device_remove+0x38/0x70 [ 223.175826] [<ffff00000843cd5c>] device_release_driver_internal+0xd0/0x1c0 [ 223.182700] [<ffff00000843ced8>] driver_detach+0x70/0x7c [ 223.188012] [<ffff00000843bf68>] bus_remove_driver+0x74/0xa0 [ 223.193669] [<ffff00000843d7e4>] driver_unregister+0x48/0x4c [ 223.199329] [<ffff0000084ba8dc>] i2c_del_driver+0x24/0x30 [ 223.204726] [<ffff0000007e5078>] pcm3168a_i2c_driver_exit+0x10/0xf98 [snd_soc_pcm3168a_i2c] [ 223.213079] [<ffff00000811bd28>] SyS_delete_module+0x198/0x1d4 [ 223.218909] Exception stack(0xffff00000a0a3ec0 to 0xffff00000a0a4000) [ 223.225349] 3ec0: 0000aaaafeb4b268 0000000000000800 14453f6470497100 0000fffffaa520d8 [ 223.233179] 3ee0: 0000fffffaa520d9 000000000000000a 1999999999999999 0000000000000000 [ 223.241008] 3f00: 000000000000006a 0000ffffa8f7d1d8 000000000000000a 0000000000000005 [ 223.248838] 3f20: 0000000000000000 0000000000000000 000000000000002d 0000000000000000 [ 223.256668] 3f40: 0000aaaae19b9f68 0000ffffa8f411f0 0000000000000000 0000aaaae19b9000 [ 223.264497] 3f60: 0000fffffaa533b8 0000fffffaa531f0 0000000000000000 0000000000000001 [ 223.272327] 3f80: 0000fffffaa53ec6 0000000000000000 0000aaaafeb4b200 0000aaaafeb4a010 [ 223.280157] 3fa0: 0000000000000000 0000fffffaa53130 0000aaaae199f36c 0000fffffaa53130 [ 223.287986] 3fc0: 0000ffffa8f411f8 0000000000000000 0000aaaafeb4b268 000000000000006a [ 223.295816] 3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 223.303648] [<ffff0000080832c0>] el0_svc_naked+0x34/0x38 [ 223.308958] ---[ end trace eaf8939a3698b1a9 ]--- [ 223.313752] ------------[ cut here ]------------ [ 223.318383] WARNING: CPU: 0 PID: 2423 at drivers/clk/clk.c:477 clk_core_unprepare+0xc/0x1ac [ 223.326733] Modules linked in: [ 223.329784] snd_soc_pcm3168a_i2c(-) [ 223.333356] snd_aloop [ 223.335712] arc4 [ 223.337633] wl18xx [ 223.339728] wlcore [ 223.341823] mac80211 [ 223.344092] cfg80211 [ 223.346360] aes_ce_blk [ 223.348803] crypto_simd [ 223.351332] cryptd [ 223.353428] aes_ce_cipher [ 223.356131] crc32_ce [ 223.358400] ghash_ce [ 223.360668] aes_arm64 [ 223.363024] gf128mul [ 223.365293] snd_soc_rcar [ 223.367909] sha2_ce [ 223.370091] xhci_plat_hcd [ 223.372794] sha256_arm64 [ 223.375410] xhci_hcd [ 223.377679] sha1_ce [ 223.379861] renesas_usbhs [ 223.382564] evdev [ 223.384572] sha1_generic [ 223.387188] rcar_gen3_thermal [ 223.390239] cpufreq_dt [ 223.392682] ravb_streaming(C) [ 223.395732] wlcore_sdio [ 223.398261] thermal_sys [ 223.400790] udc_core [ 223.403059] mch_core(C) [ 223.405588] usb_dmac [ 223.407856] snd_soc_pcm3168a [ 223.410820] snd_soc_ak4613 [ 223.413609] gpio_keys [ 223.415965] virt_dma [ 223.418234] nfsd [ 223.420155] ipv6 [ 223.422076] autofs4 [ 223.424258] [last unloaded: snd_soc_pcm3168a_i2c] [ 223.429050] CPU: 0 PID: 2423 Comm: rmmod Tainted: G WC 4.14.63-04798-gd456126e4a42-dirty #457 [ 223.438616] Hardware name: Renesas H3ULCB Kingfisher board based on r8a7795 ES2.0+ (DT) [ 223.446618] task: ffff8006fa8c6200 task.stack: ffff00000a0a0000 [ 223.452536] PC is at clk_core_unprepare+0xc/0x1ac [ 223.457239] LR is at clk_unprepare+0x28/0x3c [ 223.461506] pc : [<ffff0000083ab5a4>] lr : [<ffff0000083ace4c>] pstate: 60000145 [ 223.468900] sp : ffff00000a0a3d00 [ 223.472211] x29: ffff00000a0a3d00 [ 223.475609] x28: ffff8006fa8c6200 [ 223.479009] x27: ffff0000086f1000 [ 223.482407] x26: 000000000000006a [ 223.485807] x25: 0000000000000124 [ 223.489205] x24: 0000000000000015 [ 223.492604] x23: ffff8006f9ffa8d0 [ 223.496003] x22: ffff8006faf16480 [ 223.499402] x21: ffff0000007e7040 [ 223.502800] x20: ffff8006faf16420 [ 223.506199] x19: ffff8006faadd100 [ 223.509597] x18: 000000000000000a [ 223.512997] x17: 0000000000000000 [ 223.516395] x16: 0000000000000000 [ 223.519794] x15: 0000000000000000 [ 223.523192] x14: 00000033fe89076c [ 223.526591] x13: 0000000000000400 [ 223.529989] x12: 0000000000000400 [ 223.533388] x11: 0000000000000000 [ 223.536786] x10: 00000000000009e0 [ 223.540185] x9 : ffff00000a0a3be0 [ 223.543583] x8 : ffff8006fa8c6c40 [ 223.546982] x7 : ffff8006fa8c6400 [ 223.550380] x6 : 0000000000000001 [ 223.553780] x5 : 0000000000000000 [ 223.557178] x4 : ffff8006fa8c6200 [ 223.560577] x3 : 0000000000000000 [ 223.563975] x2 : ffff8006fa8c6200 [ 223.567374] x1 : 0000000000000000 [ 223.570772] x0 : ffff8006faadd100 [ 223.574170] Call trace: [ 223.576615] Exception stack(0xffff00000a0a3bc0 to 0xffff00000a0a3d00) [ 223.583054] 3bc0: ffff8006faadd100 0000000000000000 ffff8006fa8c6200 0000000000000000 [ 223.590884] 3be0: ffff8006fa8c6200 0000000000000000 0000000000000001 ffff8006fa8c6400 [ 223.598714] 3c00: ffff8006fa8c6c40 ffff00000a0a3be0 00000000000009e0 0000000000000000 [ 223.606544] 3c20: 0000000000000400 0000000000000400 00000033fe89076c 0000000000000000 [ 223.614374] 3c40: 0000000000000000 0000000000000000 000000000000000a ffff8006faadd100 [ 223.622204] 3c60: ffff8006faf16420 ffff0000007e7040 ffff8006faf16480 ffff8006f9ffa8d0 [ 223.630033] 3c80: 0000000000000015 0000000000000124 000000000000006a ffff0000086f1000 [ 223.637863] 3ca0: ffff8006fa8c6200 ffff00000a0a3d00 ffff0000083ace4c ffff00000a0a3d00 [ 223.645693] 3cc0: ffff0000083ab5a4 0000000060000145 0000000000000140 ffff8006faadd100 [ 223.653523] 3ce0: 0000ffffffffffff ffff0000083ace44 ffff00000a0a3d00 ffff0000083ab5a4 [ 223.661353] [<ffff0000083ab5a4>] clk_core_unprepare+0xc/0x1ac [ 223.667103] [<ffff0000007d8330>] pcm3168a_remove+0x44/0x50 [snd_soc_pcm3168a] [ 223.674239] [<ffff0000007e5010>] pcm3168a_i2c_remove+0x10/0x1c [snd_soc_pcm3168a_i2c] [ 223.682070] [<ffff0000084b9d9c>] i2c_device_remove+0x38/0x70 [ 223.687731] [<ffff00000843cd5c>] device_release_driver_internal+0xd0/0x1c0 [ 223.694604] [<ffff00000843ced8>] driver_detach+0x70/0x7c [ 223.699915] [<ffff00000843bf68>] bus_remove_driver+0x74/0xa0 [ 223.705572] [<ffff00000843d7e4>] driver_unregister+0x48/0x4c [ 223.711230] [<ffff0000084ba8dc>] i2c_del_driver+0x24/0x30 [ 223.716628] [<ffff0000007e5078>] pcm3168a_i2c_driver_exit+0x10/0xf98 [snd_soc_pcm3168a_i2c] [ 223.724980] [<ffff00000811bd28>] SyS_delete_module+0x198/0x1d4 [ 223.730811] Exception stack(0xffff00000a0a3ec0 to 0xffff00000a0a4000) [ 223.737250] 3ec0: 0000aaaafeb4b268 0000000000000800 14453f6470497100 0000fffffaa520d8 [ 223.745079] 3ee0: 0000fffffaa520d9 000000000000000a 1999999999999999 0000000000000000 [ 223.752909] 3f00: 000000000000006a 0000ffffa8f7d1d8 000000000000000a 0000000000000005 [ 223.760739] 3f20: 0000000000000000 0000000000000000 000000000000002d 0000000000000000 [ 223.768568] 3f40: 0000aaaae19b9f68 0000ffffa8f411f0 0000000000000000 0000aaaae19b9000 [ 223.776398] 3f60: 0000fffffaa533b8 0000fffffaa531f0 0000000000000000 0000000000000001 [ 223.784227] 3f80: 0000fffffaa53ec6 0000000000000000 0000aaaafeb4b200 0000aaaafeb4a010 [ 223.792057] 3fa0: 0000000000000000 0000fffffaa53130 0000aaaae199f36c 0000fffffaa53130 [ 223.799886] 3fc0: 0000ffffa8f411f8 0000000000000000 0000aaaafeb4b268 000000000000006a [ 223.807715] 3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 223.815546] [<ffff0000080832c0>] el0_svc_naked+0x34/0x38 [ 223.820855] ---[ end trace eaf8939a3698b1aa ]---
Fix this issue by only disable clock and regulators in remove callback when CONFIG_PM isn't defined
Signed-off-by: Jiada Wang jiada_wang@mentor.com Signed-off-by: Mark Brown broonie@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- sound/soc/codecs/pcm3168a.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/sound/soc/codecs/pcm3168a.c b/sound/soc/codecs/pcm3168a.c index 3356c91f55b0..e3de1ff3b6c2 100644 --- a/sound/soc/codecs/pcm3168a.c +++ b/sound/soc/codecs/pcm3168a.c @@ -688,15 +688,22 @@ int pcm3168a_probe(struct device *dev, struct regmap *regmap) } EXPORT_SYMBOL_GPL(pcm3168a_probe);
-void pcm3168a_remove(struct device *dev) +static void pcm3168a_disable(struct device *dev) { struct pcm3168a_priv *pcm3168a = dev_get_drvdata(dev);
- pm_runtime_disable(dev); regulator_bulk_disable(ARRAY_SIZE(pcm3168a->supplies), - pcm3168a->supplies); + pcm3168a->supplies); clk_disable_unprepare(pcm3168a->scki); } + +void pcm3168a_remove(struct device *dev) +{ + pm_runtime_disable(dev); +#ifndef CONFIG_PM + pcm3168a_disable(dev); +#endif +} EXPORT_SYMBOL_GPL(pcm3168a_remove);
#ifdef CONFIG_PM @@ -751,10 +758,7 @@ static int pcm3168a_rt_suspend(struct device *dev)
regcache_cache_only(pcm3168a->regmap, true);
- regulator_bulk_disable(ARRAY_SIZE(pcm3168a->supplies), - pcm3168a->supplies); - - clk_disable_unprepare(pcm3168a->scki); + pcm3168a_disable(dev);
return 0; }
From: Manish Rangankar manish.rangankar@cavium.com
[ Upstream commit d5632b11f0a17efa6356311e535ae135d178438d ]
The kernel panic was observed after switch side perturbation,
BUG: unable to handle kernel NULL pointer dereference at (null) IP: [<ffffffff8132b5a0>] strcmp+0x20/0x40 PGD 0 Oops: 0000 [#1] SMP CPU: 8 PID: 647 Comm: kworker/8:1 Tainted: G W OE ------------ 3.10.0-693.el7.x86_64 #1 Hardware name: HPE ProLiant DL380 Gen10/ProLiant DL380 Gen10, BIOS U30 06/20/2018 Workqueue: slowpath-13:00. qed_slowpath_task [qed] task: ffff880429eb8fd0 ti: ffff880429190000 task.ti: ffff880429190000 RIP: 0010:[<ffffffff8132b5a0>] [<ffffffff8132b5a0>] strcmp+0x20/0x40 RSP: 0018:ffff880429193c68 EFLAGS: 00010202 RAX: 000000000000000a RBX: 0000000000000002 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff88042bda7a41 RBP: ffff880429193c68 R08: 000000000000ffff R09: 000000000000ffff R10: 0000000000000007 R11: ffff88042b3af338 R12: ffff880420b007a0 R13: ffff88081aa56af8 R14: 0000000000000001 R15: ffff88081aa50410 FS: 0000000000000000(0000) GS:ffff88042fe00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000000019f2000 CR4: 00000000003407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffff880429193d20 ffffffffc02a0c90 ffffc90004b32000 ffff8803fd3ec600 ffff88042bda7800 ffff88042bda7a00 ffff88042bda7840 ffff88042bda7a40 0000000129193d10 2e3836312e323931 ff000a342e363232 ffffffffc01ad99d Call Trace: [<ffffffffc02a0c90>] qedi_get_protocol_tlv_data+0x270/0x470 [qedi] [<ffffffffc01ad99d>] ? qed_mfw_process_tlv_req+0x24d/0xbf0 [qed] [<ffffffffc01653ae>] qed_mfw_fill_tlv_data+0x5e/0xd0 [qed] [<ffffffffc01ad9b9>] qed_mfw_process_tlv_req+0x269/0xbf0 [qed]
Fix kernel NULL pointer deref by checking for session is online before getting iSCSI TLV data.
Signed-off-by: Manish Rangankar manish.rangankar@cavium.com Reviewed-by: Lee Duncan lduncan@suse.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/scsi/qedi/qedi_main.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index e5bd035ebad0..4de740da547b 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -952,6 +952,9 @@ static int qedi_find_boot_info(struct qedi_ctx *qedi, cls_sess = iscsi_conn_to_session(cls_conn); sess = cls_sess->dd_data;
+ if (!iscsi_is_session_online(cls_sess)) + continue; + if (pri_ctrl_flags) { if (!strcmp(pri_tgt->iscsi_name, sess->targetname) && !strcmp(pri_tgt->ip_addr, ep_ip_addr)) {
From: Chris Wilson chris@chris-wilson.co.uk
[ Upstream commit 3b34c14fd50c302db091f020f26dd00ede902c80 ]
As amd_uvd_resume() accesses the uvd ring, it must be initialised first or else we trigger errors like:
[ 5.595963] [drm] Found UVD firmware Version: 1.87 Family ID: 17 [ 5.595969] [drm] PSP loading UVD firmware [ 5.596266] ------------[ cut here ]------------ [ 5.596268] ODEBUG: assert_init not available (active state 0) object type: timer_list hint: (null) [ 5.596285] WARNING: CPU: 0 PID: 507 at lib/debugobjects.c:329 debug_print_object+0x6a/0x80 [ 5.596286] Modules linked in: amdgpu(+) hid_logitech_hidpp(+) chash gpu_sched amd_iommu_v2 ttm drm_kms_helper crc32c_intel drm hid_sony ff_memless igb hid_logitech_dj nvme dca i2c_algo_bit nvme_core wmi pinctrl_amd uas usb_storage [ 5.596299] CPU: 0 PID: 507 Comm: systemd-udevd Tainted: G W 4.20.0-0.rc1.git4.1.fc30.x86_64 #1 [ 5.596301] Hardware name: System manufacturer System Product Name/ROG STRIX X470-I GAMING, BIOS 0901 07/23/2018 [ 5.596303] RIP: 0010:debug_print_object+0x6a/0x80 [ 5.596305] Code: 8b 43 10 83 c2 01 8b 4b 14 4c 89 e6 89 15 e6 82 b0 02 4c 8b 45 00 48 c7 c7 60 fd 34 a6 48 8b 14 c5 a0 da 08 a6 e8 6a 6a b8 ff <0f> 0b 5b 83 05 d0 45 3e 01 01 5d 41 5c c3 83 05 c5 45 3e 01 01 c3 [ 5.596306] RSP: 0018:ffffa02ac863f8c0 EFLAGS: 00010282 [ 5.596307] RAX: 0000000000000000 RBX: ffffa02ac863f8e0 RCX: 0000000000000006 [ 5.596308] RDX: 0000000000000007 RSI: ffff9160e9a7bfe8 RDI: ffff9160f91d6c60 [ 5.596310] RBP: ffffffffa6742740 R08: 0000000000000002 R09: 0000000000000000 [ 5.596311] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffa634ff69 [ 5.596312] R13: 00000000000b79d0 R14: ffffffffa80f76d8 R15: 0000000000266000 [ 5.596313] FS: 00007f762abf7940(0000) GS:ffff9160f9000000(0000) knlGS:0000000000000000 [ 5.596314] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5.596315] CR2: 000055fdc593f000 CR3: 00000007e999c000 CR4: 00000000003406f0 [ 5.596317] Call Trace: [ 5.596321] debug_object_assert_init+0x14a/0x180 [ 5.596327] del_timer+0x2e/0x90 [ 5.596383] amdgpu_fence_process+0x47/0x100 [amdgpu] [ 5.596430] amdgpu_uvd_resume+0xf6/0x120 [amdgpu] [ 5.596475] uvd_v7_0_sw_init+0xe0/0x280 [amdgpu] [ 5.596523] amdgpu_device_init.cold.30+0xf97/0x14b6 [amdgpu] [ 5.596563] ? amdgpu_driver_load_kms+0x53/0x330 [amdgpu] [ 5.596604] amdgpu_driver_load_kms+0x86/0x330 [amdgpu] [ 5.596614] drm_dev_register+0x115/0x150 [drm] [ 5.596654] amdgpu_pci_probe+0xbd/0x120 [amdgpu] [ 5.596658] local_pci_probe+0x41/0x90 [ 5.596661] pci_device_probe+0x188/0x1a0 [ 5.596666] really_probe+0xf8/0x3b0 [ 5.596669] driver_probe_device+0xb3/0xf0 [ 5.596672] __driver_attach+0xe1/0x110 [ 5.596674] ? driver_probe_device+0xf0/0xf0 [ 5.596676] bus_for_each_dev+0x79/0xc0 [ 5.596679] bus_add_driver+0x155/0x230 [ 5.596681] ? 0xffffffffc07d9000 [ 5.596683] driver_register+0x6b/0xb0 [ 5.596685] ? 0xffffffffc07d9000 [ 5.596688] do_one_initcall+0x5d/0x2be [ 5.596691] ? rcu_read_lock_sched_held+0x79/0x80 [ 5.596693] ? kmem_cache_alloc_trace+0x264/0x290 [ 5.596695] ? do_init_module+0x22/0x210 [ 5.596698] do_init_module+0x5a/0x210 [ 5.596701] load_module+0x2137/0x2430 [ 5.596703] ? lockdep_hardirqs_on+0xed/0x180 [ 5.596714] ? __do_sys_init_module+0x150/0x1a0 [ 5.596715] __do_sys_init_module+0x150/0x1a0 [ 5.596722] do_syscall_64+0x60/0x1f0 [ 5.596725] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 5.596726] RIP: 0033:0x7f762b877dee [ 5.596728] Code: 48 8b 0d 9d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 6a 20 0c 00 f7 d8 64 89 01 48 [ 5.596729] RSP: 002b:00007ffc777b8558 EFLAGS: 00000246 ORIG_RAX: 00000000000000af [ 5.596730] RAX: ffffffffffffffda RBX: 000055fdc48da320 RCX: 00007f762b877dee [ 5.596731] RDX: 00007f762b9f284d RSI: 00000000006c5fc6 RDI: 000055fdc527a060 [ 5.596732] RBP: 00007f762b9f284d R08: 0000000000000003 R09: 0000000000000002 [ 5.596733] R10: 000055fdc48ad010 R11: 0000000000000246 R12: 000055fdc527a060 [ 5.596734] R13: 000055fdc48dca20 R14: 0000000000020000 R15: 0000000000000000 [ 5.596740] irq event stamp: 134618 [ 5.596743] hardirqs last enabled at (134617): [<ffffffffa513d52e>] console_unlock+0x45e/0x610 [ 5.596744] hardirqs last disabled at (134618): [<ffffffffa50037e8>] trace_hardirqs_off_thunk+0x1a/0x1c [ 5.596746] softirqs last enabled at (133146): [<ffffffffa5e00365>] __do_softirq+0x365/0x47c [ 5.596748] softirqs last disabled at (133139): [<ffffffffa50c64f9>] irq_exit+0x119/0x120 [ 5.596749] ---[ end trace eaee508abfebccdc ]---
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108709 Reviewed-by: Christian König christian.koenig@amd.com Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Alex Deucher alexdeucher@gmail.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c index 8a926d1df939..2b4199adcd94 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c @@ -116,16 +116,16 @@ static int uvd_v4_2_sw_init(void *handle) if (r) return r;
- r = amdgpu_uvd_resume(adev); - if (r) - return r; - ring = &adev->uvd.inst->ring; sprintf(ring->name, "uvd"); r = amdgpu_ring_init(adev, ring, 512, &adev->uvd.inst->irq, 0); if (r) return r;
+ r = amdgpu_uvd_resume(adev); + if (r) + return r; + r = amdgpu_uvd_entity_init(adev);
return r; diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c index 50248059412e..88c006c5ee2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c @@ -113,16 +113,16 @@ static int uvd_v5_0_sw_init(void *handle) if (r) return r;
- r = amdgpu_uvd_resume(adev); - if (r) - return r; - ring = &adev->uvd.inst->ring; sprintf(ring->name, "uvd"); r = amdgpu_ring_init(adev, ring, 512, &adev->uvd.inst->irq, 0); if (r) return r;
+ r = amdgpu_uvd_resume(adev); + if (r) + return r; + r = amdgpu_uvd_entity_init(adev);
return r; diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c index 6ae82cc2e55e..d4070839ac80 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c @@ -420,16 +420,16 @@ static int uvd_v6_0_sw_init(void *handle) DRM_INFO("UVD ENC is disabled\n"); }
- r = amdgpu_uvd_resume(adev); - if (r) - return r; - ring = &adev->uvd.inst->ring; sprintf(ring->name, "uvd"); r = amdgpu_ring_init(adev, ring, 512, &adev->uvd.inst->irq, 0); if (r) return r;
+ r = amdgpu_uvd_resume(adev); + if (r) + return r; + if (uvd_v6_0_enc_support(adev)) { for (i = 0; i < adev->uvd.num_enc_rings; ++i) { ring = &adev->uvd.inst->ring_enc[i]; diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c index 9b7f8469bc5c..057151b17b45 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c @@ -444,10 +444,6 @@ static int uvd_v7_0_sw_init(void *handle) DRM_INFO("PSP loading UVD firmware\n"); }
- r = amdgpu_uvd_resume(adev); - if (r) - return r; - for (j = 0; j < adev->uvd.num_uvd_inst; j++) { if (adev->uvd.harvest_config & (1 << j)) continue; @@ -479,6 +475,10 @@ static int uvd_v7_0_sw_init(void *handle) } }
+ r = amdgpu_uvd_resume(adev); + if (r) + return r; + r = amdgpu_uvd_entity_init(adev); if (r) return r;
From: Chuck Lever chuck.lever@oracle.com
[ Upstream commit b024dd0eba6e6d568f69d63c5e3153aba94c23e3 ]
FRWR memory registration is done with a series of calls and WRs. 1. ULP invokes ib_dma_map_sg() 2. ULP invokes ib_map_mr_sg() 3. ULP posts an IB_WR_REG_MR on the Send queue
Step 2 generates an iova. It is permissible for ULPs to change this iova (with certain restrictions) between steps 2 and 3.
rxe_map_mr_sg captures the MR's iova but later when rxe processes the REG_MR WR, it ignores the MR's iova field. If a ULP alters the MR's iova after step 2 but before step 3, rxe never captures that change.
When the remote sends an RDMA Read targeting that MR, rxe looks up the R_key, but the altered iova does not match the iova stored in the MR, causing the RDMA Read request to fail.
Reported-by: Anna Schumaker schumaker.anna@gmail.com Signed-off-by: Chuck Lever chuck.lever@oracle.com Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Jason Gunthorpe jgg@mellanox.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/infiniband/sw/rxe/rxe_req.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 8be27238a86e..fa98a5279647 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -640,6 +640,7 @@ int rxe_requester(void *arg) rmr->access = wqe->wr.wr.reg.access; rmr->lkey = wqe->wr.wr.reg.key; rmr->rkey = wqe->wr.wr.reg.key; + rmr->iova = wqe->wr.wr.reg.mr->iova; wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; } else {
From: Nathan Chancellor natechancellor@gmail.com
[ Upstream commit 3db5e0ba8b8f4aee631d7ee04b7a11c56cfdc213 ]
When building the kernel with Clang, some disabled warnings appear because this Makefile overrides KBUILD_CFLAGS for x86{,_64}. Add them to this list so that the build is clean again.
-Wpointer-sign was disabled for the whole kernel before the beginning of Git history.
-Waddress-of-packed-member was disabled for the whole kernel and for the early boot code in these commits:
bfb38988c51e ("kbuild: clang: Disable 'address-of-packed-member' warning") 20c6c1890455 ("x86/boot: Disable the address-of-packed-member compiler warning").
-Wgnu was disabled for the whole kernel and for the early boot code in these commits:
61163efae020 ("kbuild: LLVMLinux: Add Kbuild support for building kernel with Clang") 6c3b56b19730 ("x86/boot: Disable Clang warnings about GNU extensions").
[ mingo: Made the changelog more readable. ]
Tested-by: Sedat Dilek sedat.dilek@gmail.com Signed-off-by: Nathan Chancellor natechancellor@gmail.com Signed-off-by: Ard Biesheuvel ard.biesheuvel@linaro.org Reviewed-by: Sedat Dilek sedat.dilek@gmail.com Cc: Andy Lutomirski luto@kernel.org Cc: Arend van Spriel arend.vanspriel@broadcom.com Cc: Bhupesh Sharma bhsharma@redhat.com Cc: Borislav Petkov bp@alien8.de Cc: Dave Hansen dave.hansen@intel.com Cc: Eric Snowberg eric.snowberg@oracle.com Cc: Hans de Goede hdegoede@redhat.com Cc: Joe Perches joe@perches.com Cc: Jon Hunter jonathanh@nvidia.com Cc: Julien Thierry julien.thierry@arm.com Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Marc Zyngier marc.zyngier@arm.com Cc: Matt Fleming matt@codeblueprint.co.uk Cc: Peter Zijlstra peterz@infradead.org Cc: Sai Praneeth Prakhya sai.praneeth.prakhya@intel.com Cc: Thomas Gleixner tglx@linutronix.de Cc: YiFei Zhu zhuyifei1999@gmail.com Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20181129171230.18699-8-ard.biesheuvel@linaro.org Link: https://github.com/ClangBuiltLinux/linux/issues/112 Signed-off-by: Ingo Molnar mingo@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/firmware/efi/libstub/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index c51627660dbb..d9845099635e 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -9,7 +9,10 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ -O2 \ -fPIC -fno-strict-aliasing -mno-red-zone \ - -mno-mmx -mno-sse -fshort-wchar + -mno-mmx -mno-sse -fshort-wchar \ + -Wno-pointer-sign \ + $(call cc-disable-warning, address-of-packed-member) \ + $(call cc-disable-warning, gnu)
# arm64 uses the full KBUILD_CFLAGS so it's necessary to explicitly # disable the stackleak plugin
From: Daniel Santos daniel.santos@pobox.com
[ Upstream commit a788c5272769ddbcdbab297cf386413eeac04463 ]
jffs2_sync_fs makes the assumption that if CONFIG_JFFS2_FS_WRITEBUFFER is defined then a write buffer is available and has been initialized. However, this does is not the case when the mtd device has no out-of-band buffer:
int jffs2_nand_flash_setup(struct jffs2_sb_info *c) { if (!c->mtd->oobsize) return 0; ...
The resulting call to cancel_delayed_work_sync passing a uninitialized (but zeroed) delayed_work struct forces lockdep to become disabled.
[ 90.050639] overlayfs: upper fs does not support tmpfile. [ 90.652264] INFO: trying to register non-static key. [ 90.662171] the code is fine but needs lockdep annotation. [ 90.673090] turning off the locking correctness validator. [ 90.684021] CPU: 0 PID: 1762 Comm: mount_root Not tainted 4.14.63 #0 [ 90.696672] Stack : 00000000 00000000 80d8f6a2 00000038 805f0000 80444600 8fe364f4 805dfbe7 [ 90.713349] 80563a30 000006e2 8068370c 00000001 00000000 00000001 8e2fdc48 ffffffff [ 90.730020] 00000000 00000000 80d90000 00000000 00000106 00000000 6465746e 312e3420 [ 90.746690] 6b636f6c 03bf0000 f8000000 20676e69 00000000 80000000 00000000 8e2c2a90 [ 90.763362] 80d90000 00000001 00000000 8e2c2a90 00000003 80260dc0 08052098 80680000 [ 90.780033] ... [ 90.784902] Call Trace: [ 90.789793] [<8000f0d8>] show_stack+0xb8/0x148 [ 90.798659] [<8005a000>] register_lock_class+0x270/0x55c [ 90.809247] [<8005cb64>] __lock_acquire+0x13c/0xf7c [ 90.818964] [<8005e314>] lock_acquire+0x194/0x1dc [ 90.828345] [<8003f27c>] flush_work+0x200/0x24c [ 90.837374] [<80041dfc>] __cancel_work_timer+0x158/0x210 [ 90.847958] [<801a8770>] jffs2_sync_fs+0x20/0x54 [ 90.857173] [<80125cf4>] iterate_supers+0xf4/0x120 [ 90.866729] [<80158fc4>] sys_sync+0x44/0x9c [ 90.875067] [<80014424>] syscall_common+0x34/0x58
Signed-off-by: Daniel Santos daniel.santos@pobox.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Boris Brezillon boris.brezillon@bootlin.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/jffs2/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 902a7dd10e5c..bb6ae387469f 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -101,7 +101,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
#ifdef CONFIG_JFFS2_FS_WRITEBUFFER - cancel_delayed_work_sync(&c->wbuf_dwork); + if (jffs2_is_writebuffered(c)) + cancel_delayed_work_sync(&c->wbuf_dwork); #endif
mutex_lock(&c->alloc_sem);
From: "A.s. Dong" aisheng.dong@nxp.com
[ Upstream commit 9e5ef7a57ca75a1b9411c46caeeb6881124284a3 ]
As the commit 2893c379461a ("clk: make strings in parent name arrays const"), let's make the parent strings const, otherwise we may meet the following warning when compiling:
drivers/clk/imx/clk-imx7ulp.c: In function 'imx7ulp_clocks_init': drivers/clk/imx/clk-imx7ulp.c:73:35: warning: passing argument 5 of 'imx_clk_mux_flags' discards 'const' qualifier from pointer target type
clks[IMX7ULP_CLK_APLL_PRE_SEL] = imx_clk_mux_flags("apll_pre_sel", base + 0x508, 0, 1, pll_pre_sels, ARRAY_SIZE(pll_pre_sels), CLK_SET_PARENT_GATE); ^ In file included from drivers/clk/imx/clk-imx7ulp.c:23:0: drivers/clk/imx/clk.h:200:27: note: expected 'const char **' but argument is of type 'const char * const*' ...
Cc: Stephen Boyd sboyd@codeaurora.org Cc: Michael Turquette mturquette@baylibre.com Cc: Shawn Guo shawnguo@kernel.org Signed-off-by: Dong Aisheng aisheng.dong@nxp.com Signed-off-by: Stephen Boyd sboyd@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/clk/imx/clk-busy.c | 2 +- drivers/clk/imx/clk-fixup-mux.c | 2 +- drivers/clk/imx/clk.h | 18 +++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/drivers/clk/imx/clk-busy.c b/drivers/clk/imx/clk-busy.c index 99036527eb0d..e695622c5aa5 100644 --- a/drivers/clk/imx/clk-busy.c +++ b/drivers/clk/imx/clk-busy.c @@ -154,7 +154,7 @@ static const struct clk_ops clk_busy_mux_ops = {
struct clk *imx_clk_busy_mux(const char *name, void __iomem *reg, u8 shift, u8 width, void __iomem *busy_reg, u8 busy_shift, - const char **parent_names, int num_parents) + const char * const *parent_names, int num_parents) { struct clk_busy_mux *busy; struct clk *clk; diff --git a/drivers/clk/imx/clk-fixup-mux.c b/drivers/clk/imx/clk-fixup-mux.c index c9b327e0a8dd..44817c1b0b88 100644 --- a/drivers/clk/imx/clk-fixup-mux.c +++ b/drivers/clk/imx/clk-fixup-mux.c @@ -70,7 +70,7 @@ static const struct clk_ops clk_fixup_mux_ops = { };
struct clk *imx_clk_fixup_mux(const char *name, void __iomem *reg, - u8 shift, u8 width, const char **parents, + u8 shift, u8 width, const char * const *parents, int num_parents, void (*fixup)(u32 *val)) { struct clk_fixup_mux *fixup_mux; diff --git a/drivers/clk/imx/clk.h b/drivers/clk/imx/clk.h index 8076ec040f37..e65c1115d978 100644 --- a/drivers/clk/imx/clk.h +++ b/drivers/clk/imx/clk.h @@ -63,14 +63,14 @@ struct clk *imx_clk_busy_divider(const char *name, const char *parent_name,
struct clk *imx_clk_busy_mux(const char *name, void __iomem *reg, u8 shift, u8 width, void __iomem *busy_reg, u8 busy_shift, - const char **parent_names, int num_parents); + const char * const *parent_names, int num_parents);
struct clk *imx_clk_fixup_divider(const char *name, const char *parent, void __iomem *reg, u8 shift, u8 width, void (*fixup)(u32 *val));
struct clk *imx_clk_fixup_mux(const char *name, void __iomem *reg, - u8 shift, u8 width, const char **parents, + u8 shift, u8 width, const char * const *parents, int num_parents, void (*fixup)(u32 *val));
static inline struct clk *imx_clk_fixed(const char *name, int rate) @@ -79,7 +79,8 @@ static inline struct clk *imx_clk_fixed(const char *name, int rate) }
static inline struct clk *imx_clk_mux_ldb(const char *name, void __iomem *reg, - u8 shift, u8 width, const char **parents, int num_parents) + u8 shift, u8 width, const char * const *parents, + int num_parents) { return clk_register_mux(NULL, name, parents, num_parents, CLK_SET_RATE_NO_REPARENT | CLK_SET_RATE_PARENT, reg, @@ -192,7 +193,8 @@ static inline struct clk *imx_clk_gate4(const char *name, const char *parent, }
static inline struct clk *imx_clk_mux(const char *name, void __iomem *reg, - u8 shift, u8 width, const char **parents, int num_parents) + u8 shift, u8 width, const char * const *parents, + int num_parents) { return clk_register_mux(NULL, name, parents, num_parents, CLK_SET_RATE_NO_REPARENT, reg, shift, @@ -200,7 +202,8 @@ static inline struct clk *imx_clk_mux(const char *name, void __iomem *reg, }
static inline struct clk *imx_clk_mux2(const char *name, void __iomem *reg, - u8 shift, u8 width, const char **parents, int num_parents) + u8 shift, u8 width, const char * const *parents, + int num_parents) { return clk_register_mux(NULL, name, parents, num_parents, CLK_SET_RATE_NO_REPARENT | CLK_OPS_PARENT_ENABLE, @@ -208,8 +211,9 @@ static inline struct clk *imx_clk_mux2(const char *name, void __iomem *reg, }
static inline struct clk *imx_clk_mux_flags(const char *name, - void __iomem *reg, u8 shift, u8 width, const char **parents, - int num_parents, unsigned long flags) + void __iomem *reg, u8 shift, u8 width, + const char * const *parents, int num_parents, + unsigned long flags) { return clk_register_mux(NULL, name, parents, num_parents, flags | CLK_SET_RATE_NO_REPARENT, reg, shift, width, 0,
From: "Joel Fernandes (Google)" joel@joelfernandes.org
[ Upstream commit 30696378f68a9e3dad6bfe55938b112e72af00c2 ]
The ramoops backend currently calls persistent_ram_save_old() even if a buffer is empty. While this appears to work, it is does not seem like the right thing to do and could lead to future bugs so lets avoid that. It also prevents misleading prints in the logs which claim the buffer is valid.
I got something like:
found existing buffer, size 0, start 0
When I was expecting:
no valid data in buffer (sig = ...)
This bails out early (and reports with pr_debug()), since it's an acceptable state.
Signed-off-by: Joel Fernandes (Google) joel@joelfernandes.org Co-developed-by: Kees Cook keescook@chromium.org Signed-off-by: Kees Cook keescook@chromium.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/pstore/ram_core.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 0792595ebcfb..3c777ec80d47 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -496,6 +496,11 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, sig ^= PERSISTENT_RAM_SIG;
if (prz->buffer->sig == sig) { + if (buffer_size(prz) == 0) { + pr_debug("found existing empty buffer\n"); + return 0; + } + if (buffer_size(prz) > prz->buffer_size || buffer_start(prz) > buffer_size(prz)) pr_info("found existing invalid buffer, size %zu, start %zu\n",
From: Daniel Axtens dja@axtens.net
[ Upstream commit 10e1fdb95809ed21406f53b5b4f064673a1b9ceb ]
Currently, disconnecting a USB webcam while it is in use prints out a number of warnings, such as:
WARNING: CPU: 2 PID: 3118 at /build/linux-ezBi1T/linux-4.8.0/fs/sysfs/group.c:237 sysfs_remove_group+0x8b/0x90 sysfs group ffffffffa7cd0780 not found for kobject 'event13'
This has been noticed before. [0]
This is because of the order in which things are torn down.
If there are no streams active during a USB disconnect:
- uvc_disconnect() is invoked via device_del() through the bus notifier mechanism.
- this calls uvc_unregister_video().
- uvc_unregister_video() unregisters the video device for each stream,
- because there are no streams open, it calls uvc_delete()
- uvc_delete() calls uvc_status_cleanup(), which cleans up the status input device.
- uvc_delete() calls media_device_unregister(), which cleans up the media device
- uvc_delete(), uvc_unregister_video() and uvc_disconnect() all return, and we end up back in device_del().
- device_del() then cleans up the sysfs folder for the camera with dpm_sysfs_remove(). Because uvc_status_cleanup() and media_device_unregister() have already been called, this all works nicely.
If, on the other hand, there *are* streams active during a USB disconnect:
- uvc_disconnect() is invoked
- this calls uvc_unregister_video()
- uvc_unregister_video() unregisters the video device for each stream,
- uvc_unregister_video() and uvc_disconnect() return, and we end up back in device_del().
- device_del() then cleans up the sysfs folder for the camera with dpm_sysfs_remove(). Because the status input device and the media device are children of the USB device, this also deletes their sysfs folders.
- Sometime later, the final stream is closed, invoking uvc_release().
- uvc_release() calls uvc_delete()
- uvc_delete() calls uvc_status_cleanup(), which cleans up the status input device. Because the sysfs directory has already been removed, this causes a WARNing.
- uvc_delete() calls media_device_unregister(), which cleans up the media device. Because the sysfs directory has already been removed, this causes another WARNing.
To fix this, we need to make sure the devices are always unregistered before the end of uvc_disconnect(). To this, move the unregistration into the disconnect path:
- split uvc_status_cleanup() into two parts, one on disconnect that unregisters and one on delete that frees.
- move v4l2_device_unregister() and media_device_unregister() into the disconnect path.
[0]: https://lkml.org/lkml/2016/12/8/657
[Renamed uvc_input_cleanup() to uvc_input_unregister()]
Signed-off-by: Daniel Axtens dja@axtens.net Acked-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Laurent Pinchart laurent.pinchart@ideasonboard.com Signed-off-by: Mauro Carvalho Chehab mchehab+samsung@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/media/usb/uvc/uvc_driver.c | 13 +++++++++---- drivers/media/usb/uvc/uvc_status.c | 12 ++++++++---- drivers/media/usb/uvc/uvcvideo.h | 1 + 3 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c index d46dc432456c..361abbc00486 100644 --- a/drivers/media/usb/uvc/uvc_driver.c +++ b/drivers/media/usb/uvc/uvc_driver.c @@ -1824,11 +1824,7 @@ static void uvc_delete(struct kref *kref) usb_put_intf(dev->intf); usb_put_dev(dev->udev);
- if (dev->vdev.dev) - v4l2_device_unregister(&dev->vdev); #ifdef CONFIG_MEDIA_CONTROLLER - if (media_devnode_is_registered(dev->mdev.devnode)) - media_device_unregister(&dev->mdev); media_device_cleanup(&dev->mdev); #endif
@@ -1885,6 +1881,15 @@ static void uvc_unregister_video(struct uvc_device *dev)
uvc_debugfs_cleanup_stream(stream); } + + uvc_status_unregister(dev); + + if (dev->vdev.dev) + v4l2_device_unregister(&dev->vdev); +#ifdef CONFIG_MEDIA_CONTROLLER + if (media_devnode_is_registered(dev->mdev.devnode)) + media_device_unregister(&dev->mdev); +#endif }
int uvc_register_video_device(struct uvc_device *dev, diff --git a/drivers/media/usb/uvc/uvc_status.c b/drivers/media/usb/uvc/uvc_status.c index 0722dc684378..883e4cab45e7 100644 --- a/drivers/media/usb/uvc/uvc_status.c +++ b/drivers/media/usb/uvc/uvc_status.c @@ -54,7 +54,7 @@ static int uvc_input_init(struct uvc_device *dev) return ret; }
-static void uvc_input_cleanup(struct uvc_device *dev) +static void uvc_input_unregister(struct uvc_device *dev) { if (dev->input) input_unregister_device(dev->input); @@ -71,7 +71,7 @@ static void uvc_input_report_key(struct uvc_device *dev, unsigned int code,
#else #define uvc_input_init(dev) -#define uvc_input_cleanup(dev) +#define uvc_input_unregister(dev) #define uvc_input_report_key(dev, code, value) #endif /* CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV */
@@ -292,12 +292,16 @@ int uvc_status_init(struct uvc_device *dev) return 0; }
-void uvc_status_cleanup(struct uvc_device *dev) +void uvc_status_unregister(struct uvc_device *dev) { usb_kill_urb(dev->int_urb); + uvc_input_unregister(dev); +} + +void uvc_status_cleanup(struct uvc_device *dev) +{ usb_free_urb(dev->int_urb); kfree(dev->status); - uvc_input_cleanup(dev); }
int uvc_status_start(struct uvc_device *dev, gfp_t flags) diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h index e5f5d84f1d1d..a738486fd9d6 100644 --- a/drivers/media/usb/uvc/uvcvideo.h +++ b/drivers/media/usb/uvc/uvcvideo.h @@ -750,6 +750,7 @@ int uvc_register_video_device(struct uvc_device *dev,
/* Status */ int uvc_status_init(struct uvc_device *dev); +void uvc_status_unregister(struct uvc_device *dev); void uvc_status_cleanup(struct uvc_device *dev); int uvc_status_start(struct uvc_device *dev, gfp_t flags); void uvc_status_stop(struct uvc_device *dev);
From: Breno Leitao leitao@debian.org
[ Upstream commit 8d4a862276a9c30a269d368d324fb56529e6d5fd ]
Currently xmon needs to get devtree_lock (through rtas_token()) during its invocation (at crash time). If there is a crash while devtree_lock is being held, then xmon tries to get the lock but spins forever and never get into the interactive debugger, as in the following case:
int *ptr = NULL; raw_spin_lock_irqsave(&devtree_lock, flags); *ptr = 0xdeadbeef;
This patch avoids calling rtas_token(), thus trying to get the same lock, at crash time. This new mechanism proposes getting the token at initialization time (xmon_init()) and just consuming it at crash time.
This would allow xmon to be possible invoked independent of devtree_lock being held or not.
Signed-off-by: Breno Leitao leitao@debian.org Reviewed-by: Thiago Jung Bauermann bauerman@linux.ibm.com Signed-off-by: Michael Ellerman mpe@ellerman.id.au Signed-off-by: Sasha Levin sashal@kernel.org --- arch/powerpc/xmon/xmon.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 4264aedc7775..dd6badc31f45 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -75,6 +75,9 @@ static int xmon_gate; #define xmon_owner 0 #endif /* CONFIG_SMP */
+#ifdef CONFIG_PPC_PSERIES +static int set_indicator_token = RTAS_UNKNOWN_SERVICE; +#endif static unsigned long in_xmon __read_mostly = 0; static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT);
@@ -358,7 +361,6 @@ static inline void disable_surveillance(void) #ifdef CONFIG_PPC_PSERIES /* Since this can't be a module, args should end up below 4GB. */ static struct rtas_args args; - int token;
/* * At this point we have got all the cpus we can into @@ -367,11 +369,11 @@ static inline void disable_surveillance(void) * If we did try to take rtas.lock there would be a * real possibility of deadlock. */ - token = rtas_token("set-indicator"); - if (token == RTAS_UNKNOWN_SERVICE) + if (set_indicator_token == RTAS_UNKNOWN_SERVICE) return;
- rtas_call_unlocked(&args, token, 3, 1, NULL, SURVEILLANCE_TOKEN, 0, 0); + rtas_call_unlocked(&args, set_indicator_token, 3, 1, NULL, + SURVEILLANCE_TOKEN, 0, 0);
#endif /* CONFIG_PPC_PSERIES */ } @@ -3672,6 +3674,14 @@ static void xmon_init(int enable) __debugger_iabr_match = xmon_iabr_match; __debugger_break_match = xmon_break_match; __debugger_fault_handler = xmon_fault_handler; + +#ifdef CONFIG_PPC_PSERIES + /* + * Get the token here to avoid trying to get a lock + * during the crash, causing a deadlock. + */ + set_indicator_token = rtas_token("set-indicator"); +#endif } else { __debugger = NULL; __debugger_ipi = NULL;
From: Breno Leitao leitao@debian.org
[ Upstream commit 2b038cbc5fcf12a7ee1cc9bfd5da1e46dacdee87 ]
When booting a pseries kernel with PREEMPT enabled, it dumps the following warning:
BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 caller is pseries_processor_idle_init+0x5c/0x22c CPU: 13 PID: 1 Comm: swapper/0 Not tainted 4.20.0-rc3-00090-g12201a0128bc-dirty #828 Call Trace: [c000000429437ab0] [c0000000009c8878] dump_stack+0xec/0x164 (unreliable) [c000000429437b00] [c0000000005f2f24] check_preemption_disabled+0x154/0x160 [c000000429437b90] [c000000000cab8e8] pseries_processor_idle_init+0x5c/0x22c [c000000429437c10] [c000000000010ed4] do_one_initcall+0x64/0x300 [c000000429437ce0] [c000000000c54500] kernel_init_freeable+0x3f0/0x500 [c000000429437db0] [c0000000000112dc] kernel_init+0x2c/0x160 [c000000429437e20] [c00000000000c1d0] ret_from_kernel_thread+0x5c/0x6c
This happens because the code calls get_lppaca() which calls get_paca() and it checks if preemption is disabled through check_preemption_disabled().
Preemption should be disabled because the per CPU variable may make no sense if there is a preemption (and a CPU switch) after it reads the per CPU data and when it is used.
In this device driver specifically, it is not a problem, because this code just needs to have access to one lppaca struct, and it does not matter if it is the current per CPU lppaca struct or not (i.e. when there is a preemption and a CPU migration).
That said, the most appropriate fix seems to be related to avoiding the debug_smp_processor_id() call at get_paca(), instead of calling preempt_disable() before get_paca().
Signed-off-by: Breno Leitao leitao@debian.org Signed-off-by: Michael Ellerman mpe@ellerman.id.au Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/cpuidle/cpuidle-pseries.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index 9e56bc411061..74c247972bb3 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -247,7 +247,13 @@ static int pseries_idle_probe(void) return -ENODEV;
if (firmware_has_feature(FW_FEATURE_SPLPAR)) { - if (lppaca_shared_proc(get_lppaca())) { + /* + * Use local_paca instead of get_lppaca() since + * preemption is not disabled, and it is not required in + * fact, since lppaca_ptr does not need to be the value + * associated to the current CPU, it can be from any CPU. + */ + if (lppaca_shared_proc(local_paca->lppaca_ptr)) { cpuidle_state_table = shared_states; max_idle_state = ARRAY_SIZE(shared_states); } else {
From: Ard Biesheuvel ard.biesheuvel@linaro.org
[ Upstream commit 3bbd3db86470c701091fb1d67f1fab6621debf50 ]
readelf complains about the section layout of vmlinux when building with CONFIG_RELOCATABLE=y (for KASLR):
readelf: Warning: [21]: Link field (0) should index a symtab section. readelf: Warning: [21]: Info field (0) should index a relocatable section.
Also, it seems that our use of '-pie -shared' is contradictory, and thus ambiguous. In general, the way KASLR is wired up at the moment is highly tailored to how ld.bfd happens to implement (and conflate) PIE executables and shared libraries, so given the current effort to support other toolchains, let's fix some of these issues as well.
- Drop the -pie linker argument and just leave -shared. In ld.bfd, the differences between them are unclear (except for the ELF type of the produced image [0]) but lld chokes on seeing both at the same time.
- Rename the .rela output section to .rela.dyn, as is customary for shared libraries and PIE executables, so that it is not misidentified by readelf as a static relocation section (producing the warnings above).
- Pass the -z notext and -z norelro options to explicitly instruct the linker to permit text relocations, and to omit the RELRO program header (which requires a certain section layout that we don't adhere to in the kernel). These are the defaults for current versions of ld.bfd.
- Discard .eh_frame and .gnu.hash sections to avoid them from being emitted between .head.text and .text, screwing up the section layout.
These changes only affect the ELF image, and produce the same binary image.
[0] b9dce7f1ba01 ("arm64: kernel: force ET_DYN ELF type for ...")
Cc: Nick Desaulniers ndesaulniers@google.com Cc: Peter Smith peter.smith@linaro.org Tested-by: Nick Desaulniers ndesaulniers@google.com Signed-off-by: Ard Biesheuvel ard.biesheuvel@linaro.org Signed-off-by: Will Deacon will.deacon@arm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm64/Makefile | 2 +- arch/arm64/kernel/vmlinux.lds.S | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 106039d25e2f..35649ee8ad56 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -18,7 +18,7 @@ ifeq ($(CONFIG_RELOCATABLE), y) # Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour # for relative relocs, since this leads to better Image compression # with the relocation offsets always being zero. -LDFLAGS_vmlinux += -pie -shared -Bsymbolic \ +LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \ $(call ld-option, --no-apply-dynamic-relocs) endif
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 605d1b60469c..74e469f8a850 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -99,7 +99,8 @@ SECTIONS *(.discard) *(.discard.*) *(.interp .dynamic) - *(.dynsym .dynstr .hash) + *(.dynsym .dynstr .hash .gnu.hash) + *(.eh_frame) }
. = KIMAGE_VADDR + TEXT_OFFSET; @@ -176,12 +177,12 @@ SECTIONS
PERCPU_SECTION(L1_CACHE_BYTES)
- .rela : ALIGN(8) { + .rela.dyn : ALIGN(8) { *(.rela .rela*) }
- __rela_offset = ABSOLUTE(ADDR(.rela) - KIMAGE_VADDR); - __rela_size = SIZEOF(.rela); + __rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR); + __rela_size = SIZEOF(.rela.dyn);
. = ALIGN(SEGMENT_ALIGN); __initdata_end = .;
From: Nathan Chancellor natechancellor@gmail.com
[ Upstream commit b2e9a4eda11fd2cb1e6714e9ad3f455c402568ff ]
Clang warns:
drivers/media/firewire/firedtv-avc.c:999:45: warning: implicit conversion from 'int' to 'char' changes value from 159 to -97 [-Wconstant-conversion] app_info[0] = (EN50221_TAG_APP_INFO >> 16) & 0xff; ~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ drivers/media/firewire/firedtv-avc.c:1000:45: warning: implicit conversion from 'int' to 'char' changes value from 128 to -128 [-Wconstant-conversion] app_info[1] = (EN50221_TAG_APP_INFO >> 8) & 0xff; ~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ drivers/media/firewire/firedtv-avc.c:1040:44: warning: implicit conversion from 'int' to 'char' changes value from 159 to -97 [-Wconstant-conversion] app_info[0] = (EN50221_TAG_CA_INFO >> 16) & 0xff; ~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ drivers/media/firewire/firedtv-avc.c:1041:44: warning: implicit conversion from 'int' to 'char' changes value from 128 to -128 [-Wconstant-conversion] app_info[1] = (EN50221_TAG_CA_INFO >> 8) & 0xff; ~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~ 4 warnings generated.
Change app_info's type to unsigned char to match the type of the member msg in struct ca_msg, which is the only thing passed into the app_info parameter in this function.
Link: https://github.com/ClangBuiltLinux/linux/issues/105
Signed-off-by: Nathan Chancellor natechancellor@gmail.com Signed-off-by: Mauro Carvalho Chehab mchehab+samsung@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/media/firewire/firedtv-avc.c | 6 ++++-- drivers/media/firewire/firedtv.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/media/firewire/firedtv-avc.c b/drivers/media/firewire/firedtv-avc.c index 1c933b2cf760..3ef5df1648d7 100644 --- a/drivers/media/firewire/firedtv-avc.c +++ b/drivers/media/firewire/firedtv-avc.c @@ -968,7 +968,8 @@ static int get_ca_object_length(struct avc_response_frame *r) return r->operand[7]; }
-int avc_ca_app_info(struct firedtv *fdtv, char *app_info, unsigned int *len) +int avc_ca_app_info(struct firedtv *fdtv, unsigned char *app_info, + unsigned int *len) { struct avc_command_frame *c = (void *)fdtv->avc_data; struct avc_response_frame *r = (void *)fdtv->avc_data; @@ -1009,7 +1010,8 @@ int avc_ca_app_info(struct firedtv *fdtv, char *app_info, unsigned int *len) return ret; }
-int avc_ca_info(struct firedtv *fdtv, char *app_info, unsigned int *len) +int avc_ca_info(struct firedtv *fdtv, unsigned char *app_info, + unsigned int *len) { struct avc_command_frame *c = (void *)fdtv->avc_data; struct avc_response_frame *r = (void *)fdtv->avc_data; diff --git a/drivers/media/firewire/firedtv.h b/drivers/media/firewire/firedtv.h index 876cdec8329b..009905a19947 100644 --- a/drivers/media/firewire/firedtv.h +++ b/drivers/media/firewire/firedtv.h @@ -124,8 +124,10 @@ int avc_lnb_control(struct firedtv *fdtv, char voltage, char burst, struct dvb_diseqc_master_cmd *diseqcmd); void avc_remote_ctrl_work(struct work_struct *work); int avc_register_remote_control(struct firedtv *fdtv); -int avc_ca_app_info(struct firedtv *fdtv, char *app_info, unsigned int *len); -int avc_ca_info(struct firedtv *fdtv, char *app_info, unsigned int *len); +int avc_ca_app_info(struct firedtv *fdtv, unsigned char *app_info, + unsigned int *len); +int avc_ca_info(struct firedtv *fdtv, unsigned char *app_info, + unsigned int *len); int avc_ca_reset(struct firedtv *fdtv); int avc_ca_pmt(struct firedtv *fdtv, char *app_info, int length); int avc_ca_get_time_date(struct firedtv *fdtv, int *interval);
From: Yu Zhao yuzhao@google.com
[ Upstream commit 23aa128bb28d9da69bb1bdb2b70e50128857884a ]
AMD platform device acp_audio_dma can only be created by parent PCI device driver (drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c). Pass struct device of the parent to snd_pcm_lib_preallocate_pages() so dma_alloc_coherent() can use correct dma_ops. Otherwise, it will use default dma_ops which is nommu_dma_ops on x86_64 even when IOMMU is enabled and set to non passthrough mode.
Though platform device inherits some dma related fields during its creation in mfd_add_device(), we can't simply pass its struct device to snd_pcm_lib_preallocate_pages() because dma_ops is not among the inherited fields. Even it were, drivers/iommu/amd_iommu.c would ignore it because get_device_id() doesn't handle platform device.
This change shouldn't give us any trouble even struct device of the parent becomes null or represents some non PCI device in the future, because get_dma_ops() correctly handles null struct device or uses the default dma_ops if struct device doesn't have it set.
Signed-off-by: Yu Zhao yuzhao@google.com Signed-off-by: Mark Brown broonie@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- sound/soc/amd/acp-pcm-dma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/sound/soc/amd/acp-pcm-dma.c b/sound/soc/amd/acp-pcm-dma.c index 3135e9eafd18..7f376b63a166 100644 --- a/sound/soc/amd/acp-pcm-dma.c +++ b/sound/soc/amd/acp-pcm-dma.c @@ -1147,18 +1147,21 @@ static int acp_dma_new(struct snd_soc_pcm_runtime *rtd) struct snd_soc_component *component = snd_soc_rtdcom_lookup(rtd, DRV_NAME); struct audio_drv_data *adata = dev_get_drvdata(component->dev); + struct device *parent = component->dev->parent;
switch (adata->asic_type) { case CHIP_STONEY: ret = snd_pcm_lib_preallocate_pages_for_all(rtd->pcm, SNDRV_DMA_TYPE_DEV, - NULL, ST_MIN_BUFFER, + parent, + ST_MIN_BUFFER, ST_MAX_BUFFER); break; default: ret = snd_pcm_lib_preallocate_pages_for_all(rtd->pcm, SNDRV_DMA_TYPE_DEV, - NULL, MIN_BUFFER, + parent, + MIN_BUFFER, MAX_BUFFER); break; }
From: "Michael J. Ruhl" michael.j.ruhl@intel.com
[ Upstream commit dbc2970caef74e8ff41923d302aa6fb5a4812d0e ]
An incorrect sge sizing in the HFI PIO path will cause an OOPs similar to this:
BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] hfi1_verbs_send_pio+0x3d8/0x530 [hfi1] PGD 0 Oops: 0000 1 SMP Call Trace: ? hfi1_verbs_send_dma+0xad0/0xad0 [hfi1] hfi1_verbs_send+0xdf/0x250 [hfi1] ? make_rc_ack+0xa80/0xa80 [hfi1] hfi1_do_send+0x192/0x430 [hfi1] hfi1_do_send_from_rvt+0x10/0x20 [hfi1] rvt_post_send+0x369/0x820 [rdmavt] ib_uverbs_post_send+0x317/0x570 [ib_uverbs] ib_uverbs_write+0x26f/0x420 [ib_uverbs] ? security_file_permission+0x21/0xa0 vfs_write+0xbd/0x1e0 ? mntput+0x24/0x40 SyS_write+0x7f/0xe0 system_call_fastpath+0x16/0x1b
Fix by adding the missing sizing check to correctly determine the sge length.
Fixes: 7724105686e7 ("IB/hfi1: add driver files") Reviewed-by: Mike Marciniszyn mike.marciniszyn@intel.com Signed-off-by: Michael J. Ruhl michael.j.ruhl@intel.com Signed-off-by: Dennis Dalessandro dennis.dalessandro@intel.com Signed-off-by: Jason Gunthorpe jgg@mellanox.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/infiniband/hw/hfi1/verbs.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 3dfb4cf2f8c9..48692adbe811 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1141,6 +1141,8 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
if (slen > len) slen = len; + if (slen > ss->sge.sge_length) + slen = ss->sge.sge_length; rvt_update_sge(ss, slen, false); seg_pio_copy_mid(pbuf, addr, slen); len -= slen;
From: Vivek Gautam vivek.gautam@codeaurora.org
[ Upstream commit de2563bce7a157f5296bab94f3843d7d64fb14b4 ]
Turning on CONFIG_DMA_API_DEBUG_SG results in the following error:
[ 460.308650] ------------[ cut here ]------------ [ 460.313490] qcom-venus aa00000.video-codec: DMA-API: mapping sg segment longer than device claims to support [len=4194304] [max=65536] [ 460.326017] WARNING: CPU: 3 PID: 3555 at src/kernel/dma/debug.c:1301 debug_dma_map_sg+0x174/0x254 [ 460.338888] Modules linked in: venus_dec venus_enc videobuf2_dma_sg videobuf2_memops hci_uart btqca bluetooth venus_core v4l2_mem2mem videobuf2_v4l2 videobuf2_common ath10k_snoc ath10k_core ath lzo lzo_compress zramjoydev [ 460.375811] CPU: 3 PID: 3555 Comm: V4L2DecoderThre Tainted: G W 4.19.1 #82 [ 460.384223] Hardware name: Google Cheza (rev1) (DT) [ 460.389251] pstate: 60400009 (nZCv daif +PAN -UAO) [ 460.394191] pc : debug_dma_map_sg+0x174/0x254 [ 460.398680] lr : debug_dma_map_sg+0x174/0x254 [ 460.403162] sp : ffffff80200c37d0 [ 460.406583] x29: ffffff80200c3830 x28: 0000000000010000 [ 460.412056] x27: 00000000ffffffff x26: ffffffc0f785ea80 [ 460.417532] x25: 0000000000000000 x24: ffffffc0f4ea1290 [ 460.423001] x23: ffffffc09e700300 x22: ffffffc0f4ea1290 [ 460.428470] x21: ffffff8009037000 x20: 0000000000000001 [ 460.433936] x19: ffffff80091b0000 x18: 0000000000000000 [ 460.439411] x17: 0000000000000000 x16: 000000000000f251 [ 460.444885] x15: 0000000000000006 x14: 0720072007200720 [ 460.450354] x13: ffffff800af536e0 x12: 0000000000000000 [ 460.455822] x11: 0000000000000000 x10: 0000000000000000 [ 460.461288] x9 : 537944d9c6c48d00 x8 : 537944d9c6c48d00 [ 460.466758] x7 : 0000000000000000 x6 : ffffffc0f8d98f80 [ 460.472230] x5 : 0000000000000000 x4 : 0000000000000000 [ 460.477703] x3 : 000000000000008a x2 : ffffffc0fdb13948 [ 460.483170] x1 : ffffffc0fdb0b0b0 x0 : 000000000000007a [ 460.488640] Call trace: [ 460.491165] debug_dma_map_sg+0x174/0x254 [ 460.495307] vb2_dma_sg_alloc+0x260/0x2dc [videobuf2_dma_sg] [ 460.501150] __vb2_queue_alloc+0x164/0x374 [videobuf2_common] [ 460.507076] vb2_core_reqbufs+0xfc/0x23c [videobuf2_common] [ 460.512815] vb2_reqbufs+0x44/0x5c [videobuf2_v4l2] [ 460.517853] v4l2_m2m_reqbufs+0x44/0x78 [v4l2_mem2mem] [ 460.523144] v4l2_m2m_ioctl_reqbufs+0x1c/0x28 [v4l2_mem2mem] [ 460.528976] v4l_reqbufs+0x30/0x40 [ 460.532480] __video_do_ioctl+0x36c/0x454 [ 460.536610] video_usercopy+0x25c/0x51c [ 460.540572] video_ioctl2+0x38/0x48 [ 460.544176] v4l2_ioctl+0x60/0x74 [ 460.547602] do_video_ioctl+0x948/0x3520 [ 460.551648] v4l2_compat_ioctl32+0x60/0x98 [ 460.555872] __arm64_compat_sys_ioctl+0x134/0x20c [ 460.560718] el0_svc_common+0x9c/0xe4 [ 460.564498] el0_svc_compat_handler+0x2c/0x38 [ 460.568982] el0_svc_compat+0x8/0x18 [ 460.572672] ---[ end trace ce209b87b2f3af88 ]---
From above warning one would deduce that the sg segment will overflow
the device's capacity. In reality, the hardware can accommodate larger sg segments. So, initialize the max segment size properly to weed out this warning.
Based on a similar patch sent by Sean Paul for mdss: https://patchwork.kernel.org/patch/10671457/
Signed-off-by: Vivek Gautam vivek.gautam@codeaurora.org Acked-by: Stanimir Varbanov stanimir.varbanov@linaro.org Signed-off-by: Hans Verkuil hverkuil-cisco@xs4all.nl Signed-off-by: Mauro Carvalho Chehab mchehab+samsung@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/media/platform/qcom/venus/core.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/media/platform/qcom/venus/core.c b/drivers/media/platform/qcom/venus/core.c index bb6add9d340e..5b8350e87e75 100644 --- a/drivers/media/platform/qcom/venus/core.c +++ b/drivers/media/platform/qcom/venus/core.c @@ -264,6 +264,14 @@ static int venus_probe(struct platform_device *pdev) if (ret) return ret;
+ if (!dev->dma_parms) { + dev->dma_parms = devm_kzalloc(dev, sizeof(*dev->dma_parms), + GFP_KERNEL); + if (!dev->dma_parms) + return -ENOMEM; + } + dma_set_max_seg_size(dev, DMA_BIT_MASK(32)); + INIT_LIST_HEAD(&core->instances); mutex_init(&core->lock); INIT_DELAYED_WORK(&core->work, venus_sys_error_handler);
From: Gao Xiang gaoxiang25@huawei.com
[ Upstream commit 848bd9acdcd00c164b42b14aacec242949ecd471 ]
The root cause is the race as follows: Thread #0 Thread #1
z_erofs_vle_unzip_kickoff z_erofs_submit_and_unzip
struct z_erofs_vle_unzip_io io[] atomic_add_return() wait_event() [end of function] wake_up()
Fix it by taking the waitqueue lock between atomic_add_return and wake_up to close such the race.
kernel message:
Unable to handle kernel paging request at virtual address 97f7052caa1303dc ... Workqueue: kverityd verity_work task: ffffffe32bcb8000 task.stack: ffffffe3298a0000 PC is at __wake_up_common+0x48/0xa8 LR is at __wake_up+0x3c/0x58 ... Call trace: ... [<ffffff94a08ff648>] __wake_up_common+0x48/0xa8 [<ffffff94a08ff8b8>] __wake_up+0x3c/0x58 [<ffffff94a0c11b60>] z_erofs_vle_unzip_kickoff+0x40/0x64 [<ffffff94a0c118e4>] z_erofs_vle_read_endio+0x94/0x134 [<ffffff94a0c83c9c>] bio_endio+0xe4/0xf8 [<ffffff94a1076540>] dec_pending+0x134/0x32c [<ffffff94a1076f28>] clone_endio+0x90/0xf4 [<ffffff94a0c83c9c>] bio_endio+0xe4/0xf8 [<ffffff94a1095024>] verity_work+0x210/0x368 [<ffffff94a08c4150>] process_one_work+0x188/0x4b4 [<ffffff94a08c45bc>] worker_thread+0x140/0x458 [<ffffff94a08cad48>] kthread+0xec/0x108 [<ffffff94a0883ab4>] ret_from_fork+0x10/0x1c Code: d1006273 54000260 f9400804 b9400019 (b85fc081) ---[ end trace be9dde154f677cd1 ]---
Reviewed-by: Chao Yu yuchao0@huawei.com Signed-off-by: Gao Xiang gaoxiang25@huawei.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/staging/erofs/unzip_vle.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c index 14da8cc2246a..0346630b67c8 100644 --- a/drivers/staging/erofs/unzip_vle.c +++ b/drivers/staging/erofs/unzip_vle.c @@ -724,13 +724,18 @@ static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t); bool background = tagptr_unfold_tags(t);
- if (atomic_add_return(bios, &io->pending_bios)) + if (!background) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); return; + }
- if (background) + if (!atomic_add_return(bios, &io->pending_bios)) queue_work(z_erofs_workqueue, &io->u.work); - else - wake_up(&io->u.wait); }
static inline void z_erofs_vle_read_endio(struct bio *bio)
From: yupeng yupeng0921@gmail.com
[ Upstream commit 0fbe82e628c817e292ff588cd5847fc935e025f2 ]
after set SO_DONTROUTE to 1, the IP layer should not route packets if the dest IP address is not in link scope. But if the socket has cached the dst_entry, such packets would be routed until the sk_dst_cache expires. So we should clean the sk_dst_cache when a user set SO_DONTROUTE option. Below are server/client python scripts which could reprodue this issue:
server side code:
========================================================================== import socket import struct import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.bind(('0.0.0.0', 9000)) s.listen(1) sock, addr = s.accept() sock.setsockopt(socket.SOL_SOCKET, socket.SO_DONTROUTE, struct.pack('i', 1)) while True: sock.send(b'foo') time.sleep(1) ==========================================================================
client side code: ========================================================================== import socket import time
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect(('server_address', 9000)) while True: data = s.recv(1024) print(data) ==========================================================================
Signed-off-by: yupeng yupeng0921@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/net/core/sock.c b/net/core/sock.c index 748765e35423..99bdfb7930ea 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -698,6 +698,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); + sk_dst_reset(sk); break; case SO_BROADCAST: sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
From: David Disseldorp ddiss@suse.de
[ Upstream commit 0de263577de5d5e052be5f4f93334e63cc8a7f0b ]
spc5r17.pdf specifies:
4.3.1 ASCII data field requirements ASCII data fields shall contain only ASCII printable characters (i.e., code values 20h to 7Eh) and may be terminated with one or more ASCII null (00h) characters. ASCII data fields described as being left-aligned shall have any unused bytes at the end of the field (i.e., highest offset) and the unused bytes shall be filled with ASCII space characters (20h).
LIO currently space-pads the T10 VENDOR IDENTIFICATION and PRODUCT IDENTIFICATION fields in the standard INQUIRY data. However, the PRODUCT REVISION LEVEL field in the standard INQUIRY data as well as the T10 VENDOR IDENTIFICATION field in the INQUIRY Device Identification VPD Page are zero-terminated/zero-padded.
Fix this inconsistency by using space-padding for all of the above fields.
Signed-off-by: David Disseldorp ddiss@suse.de Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Bryant G. Ly bly@catalogicsoftware.com Reviewed-by: Lee Duncan lduncan@suse.com Reviewed-by: Hannes Reinecke hare@suse.com Reviewed-by: Roman Bolshakov r.bolshakov@yadro.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/target/target_core_spc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/target/target_core_spc.c b/drivers/target/target_core_spc.c index cb0461a10808..93424db5f002 100644 --- a/drivers/target/target_core_spc.c +++ b/drivers/target/target_core_spc.c @@ -108,12 +108,17 @@ spc_emulate_inquiry_std(struct se_cmd *cmd, unsigned char *buf)
buf[7] = 0x2; /* CmdQue=1 */
- memcpy(&buf[8], "LIO-ORG ", 8); - memset(&buf[16], 0x20, 16); + /* + * ASCII data fields described as being left-aligned shall have any + * unused bytes at the end of the field (i.e., highest offset) and the + * unused bytes shall be filled with ASCII space characters (20h). + */ + memset(&buf[8], 0x20, 8 + 16 + 4); + memcpy(&buf[8], "LIO-ORG", sizeof("LIO-ORG") - 1); memcpy(&buf[16], dev->t10_wwn.model, - min_t(size_t, strlen(dev->t10_wwn.model), 16)); + strnlen(dev->t10_wwn.model, 16)); memcpy(&buf[32], dev->t10_wwn.revision, - min_t(size_t, strlen(dev->t10_wwn.revision), 4)); + strnlen(dev->t10_wwn.revision, 4)); buf[4] = 31; /* Set additional length to 31 */
return 0; @@ -251,7 +256,9 @@ spc_emulate_evpd_83(struct se_cmd *cmd, unsigned char *buf) buf[off] = 0x2; /* ASCII */ buf[off+1] = 0x1; /* T10 Vendor ID */ buf[off+2] = 0x0; - memcpy(&buf[off+4], "LIO-ORG", 8); + /* left align Vendor ID and pad with spaces */ + memset(&buf[off+4], 0x20, 8); + memcpy(&buf[off+4], "LIO-ORG", sizeof("LIO-ORG") - 1); /* Extra Byte for NULL Terminator */ id_len++; /* Identifier Length */
From: Bart Van Assche bvanassche@acm.org
[ Upstream commit ad669505c4e9db9af9faeb5c51aa399326a80d91 ]
A session must only be released after all code that accesses the session structure has finished. Make sure that this is the case by introducing a new command counter per session that is only decremented after the .release_cmd() callback has finished. This patch fixes the following crash:
BUG: KASAN: use-after-free in do_raw_spin_lock+0x1c/0x130 Read of size 4 at addr ffff8801534b16e4 by task rmdir/14805 CPU: 16 PID: 14805 Comm: rmdir Not tainted 4.18.0-rc2-dbg+ #5 Call Trace: dump_stack+0xa4/0xf5 print_address_description+0x6f/0x270 kasan_report+0x241/0x360 __asan_load4+0x78/0x80 do_raw_spin_lock+0x1c/0x130 _raw_spin_lock_irqsave+0x52/0x60 srpt_set_ch_state+0x27/0x70 [ib_srpt] srpt_disconnect_ch+0x1b/0xc0 [ib_srpt] srpt_close_session+0xa8/0x260 [ib_srpt] target_shutdown_sessions+0x170/0x180 [target_core_mod] core_tpg_del_initiator_node_acl+0xf3/0x200 [target_core_mod] target_fabric_nacl_base_release+0x25/0x30 [target_core_mod] config_item_release+0x9c/0x110 [configfs] config_item_put+0x26/0x30 [configfs] configfs_rmdir+0x3b8/0x510 [configfs] vfs_rmdir+0xb3/0x1e0 do_rmdir+0x262/0x2c0 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Cc: Nicholas Bellinger nab@linux-iscsi.org Cc: Mike Christie mchristi@redhat.com Cc: Christoph Hellwig hch@lst.de Cc: David Disseldorp ddiss@suse.de Cc: Hannes Reinecke hare@suse.de Signed-off-by: Bart Van Assche bvanassche@acm.org Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/target/target_core_transport.c | 35 ++++++++++++++++++-------- drivers/target/target_core_xcopy.c | 6 ++++- include/target/target_core_base.h | 1 + include/target/target_core_fabric.h | 2 +- 4 files changed, 32 insertions(+), 12 deletions(-)
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index fc3093d21b96..3f7aad45d215 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -224,19 +224,28 @@ void transport_subsystem_check_init(void) sub_api_initialized = 1; }
+static void target_release_sess_cmd_refcnt(struct percpu_ref *ref) +{ + struct se_session *sess = container_of(ref, typeof(*sess), cmd_count); + + wake_up(&sess->cmd_list_wq); +} + /** * transport_init_session - initialize a session object * @se_sess: Session object pointer. * * The caller must have zero-initialized @se_sess before calling this function. */ -void transport_init_session(struct se_session *se_sess) +int transport_init_session(struct se_session *se_sess) { INIT_LIST_HEAD(&se_sess->sess_list); INIT_LIST_HEAD(&se_sess->sess_acl_list); INIT_LIST_HEAD(&se_sess->sess_cmd_list); spin_lock_init(&se_sess->sess_cmd_lock); init_waitqueue_head(&se_sess->cmd_list_wq); + return percpu_ref_init(&se_sess->cmd_count, + target_release_sess_cmd_refcnt, 0, GFP_KERNEL); } EXPORT_SYMBOL(transport_init_session);
@@ -247,6 +256,7 @@ EXPORT_SYMBOL(transport_init_session); struct se_session *transport_alloc_session(enum target_prot_op sup_prot_ops) { struct se_session *se_sess; + int ret;
se_sess = kmem_cache_zalloc(se_sess_cache, GFP_KERNEL); if (!se_sess) { @@ -254,7 +264,11 @@ struct se_session *transport_alloc_session(enum target_prot_op sup_prot_ops) " se_sess_cache\n"); return ERR_PTR(-ENOMEM); } - transport_init_session(se_sess); + ret = transport_init_session(se_sess); + if (ret < 0) { + kfree(se_sess); + return ERR_PTR(ret); + } se_sess->sup_prot_ops = sup_prot_ops;
return se_sess; @@ -581,6 +595,7 @@ void transport_free_session(struct se_session *se_sess) sbitmap_queue_free(&se_sess->sess_tag_pool); kvfree(se_sess->sess_cmd_map); } + percpu_ref_exit(&se_sess->cmd_count); kmem_cache_free(se_sess_cache, se_sess); } EXPORT_SYMBOL(transport_free_session); @@ -2724,6 +2739,7 @@ int target_get_sess_cmd(struct se_cmd *se_cmd, bool ack_kref) } se_cmd->transport_state |= CMD_T_PRE_EXECUTE; list_add_tail(&se_cmd->se_cmd_list, &se_sess->sess_cmd_list); + percpu_ref_get(&se_sess->cmd_count); out: spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags);
@@ -2754,8 +2770,6 @@ static void target_release_cmd_kref(struct kref *kref) if (se_sess) { spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); list_del_init(&se_cmd->se_cmd_list); - if (se_sess->sess_tearing_down && list_empty(&se_sess->sess_cmd_list)) - wake_up(&se_sess->cmd_list_wq); spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags); }
@@ -2763,6 +2777,8 @@ static void target_release_cmd_kref(struct kref *kref) se_cmd->se_tfo->release_cmd(se_cmd); if (compl) complete(compl); + + percpu_ref_put(&se_sess->cmd_count); }
/** @@ -2891,6 +2907,8 @@ void target_sess_cmd_list_set_waiting(struct se_session *se_sess) spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); se_sess->sess_tearing_down = 1; spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags); + + percpu_ref_kill(&se_sess->cmd_count); } EXPORT_SYMBOL(target_sess_cmd_list_set_waiting);
@@ -2905,17 +2923,14 @@ void target_wait_for_sess_cmds(struct se_session *se_sess)
WARN_ON_ONCE(!se_sess->sess_tearing_down);
- spin_lock_irq(&se_sess->sess_cmd_lock); do { - ret = wait_event_lock_irq_timeout( - se_sess->cmd_list_wq, - list_empty(&se_sess->sess_cmd_list), - se_sess->sess_cmd_lock, 180 * HZ); + ret = wait_event_timeout(se_sess->cmd_list_wq, + percpu_ref_is_zero(&se_sess->cmd_count), + 180 * HZ); list_for_each_entry(cmd, &se_sess->sess_cmd_list, se_cmd_list) target_show_cmd("session shutdown: still waiting for ", cmd); } while (ret <= 0); - spin_unlock_irq(&se_sess->sess_cmd_lock); } EXPORT_SYMBOL(target_wait_for_sess_cmds);
diff --git a/drivers/target/target_core_xcopy.c b/drivers/target/target_core_xcopy.c index 2718a933c0c6..7cdb5d7f6538 100644 --- a/drivers/target/target_core_xcopy.c +++ b/drivers/target/target_core_xcopy.c @@ -480,6 +480,8 @@ static const struct target_core_fabric_ops xcopy_pt_tfo = {
int target_xcopy_setup_pt(void) { + int ret; + xcopy_wq = alloc_workqueue("xcopy_wq", WQ_MEM_RECLAIM, 0); if (!xcopy_wq) { pr_err("Unable to allocate xcopy_wq\n"); @@ -497,7 +499,9 @@ int target_xcopy_setup_pt(void) INIT_LIST_HEAD(&xcopy_pt_nacl.acl_list); INIT_LIST_HEAD(&xcopy_pt_nacl.acl_sess_list); memset(&xcopy_pt_sess, 0, sizeof(struct se_session)); - transport_init_session(&xcopy_pt_sess); + ret = transport_init_session(&xcopy_pt_sess); + if (ret < 0) + return ret;
xcopy_pt_nacl.se_tpg = &xcopy_pt_tpg; xcopy_pt_nacl.nacl_sess = &xcopy_pt_sess; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 7a4ee7852ca4..2cfd3b4573b0 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -602,6 +602,7 @@ struct se_session { struct se_node_acl *se_node_acl; struct se_portal_group *se_tpg; void *fabric_sess_ptr; + struct percpu_ref cmd_count; struct list_head sess_list; struct list_head sess_acl_list; struct list_head sess_cmd_list; diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index f4147b398431..eb9d0923c55c 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -116,7 +116,7 @@ struct se_session *target_setup_session(struct se_portal_group *, struct se_session *, void *)); void target_remove_session(struct se_session *);
-void transport_init_session(struct se_session *); +int transport_init_session(struct se_session *se_sess); struct se_session *transport_alloc_session(enum target_prot_op); int transport_alloc_session_tags(struct se_session *, unsigned int, unsigned int);
From: "Dmitry V. Levin" ldv@altlinux.org
[ Upstream commit b708a3cc9600390ccaa2b68a88087dd265154b2b ]
I've stumbled over the current macro-expand behaviour of the test harness:
$ gcc -Wall -xc - <<'__EOF__' TEST(macro) { int status = 0; ASSERT_TRUE(WIFSIGNALED(status)); } TEST_HARNESS_MAIN __EOF__ $ ./a.out [==========] Running 1 tests from 1 test cases. [ RUN ] global.macro <stdin>:4:global.macro:Expected 0 (0) != (((signed char) (((status) & 0x7f) + 1) >> 1) > 0) (0) global.macro: Test terminated by assertion [ FAIL ] global.macro [==========] 0 / 1 tests passed. [ FAILED ]
With this change the output of the same test looks much more comprehensible:
[==========] Running 1 tests from 1 test cases. [ RUN ] global.macro <stdin>:4:global.macro:Expected 0 (0) != WIFSIGNALED(status) (0) global.macro: Test terminated by assertion [ FAIL ] global.macro [==========] 0 / 1 tests passed. [ FAILED ]
The issue is very similar to the bug fixed in glibc assert(3) three years ago: https://sourceware.org/bugzilla/show_bug.cgi?id=18604
Cc: Shuah Khan shuah@kernel.org Cc: Kees Cook keescook@chromium.org Cc: Andy Lutomirski luto@amacapital.net Cc: Will Drewry wad@chromium.org Cc: linux-kselftest@vger.kernel.org Signed-off-by: Dmitry V. Levin ldv@altlinux.org Acked-by: Kees Cook keescook@chromium.org Signed-off-by: Shuah Khan shuah@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- tools/testing/selftests/kselftest_harness.h | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-)
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 6ae3730c4ee3..76d654ef3234 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -354,7 +354,7 @@ * ASSERT_EQ(expected, measured): expected == measured */ #define ASSERT_EQ(expected, seen) \ - __EXPECT(expected, seen, ==, 1) + __EXPECT(expected, #expected, seen, #seen, ==, 1)
/** * ASSERT_NE(expected, seen) @@ -365,7 +365,7 @@ * ASSERT_NE(expected, measured): expected != measured */ #define ASSERT_NE(expected, seen) \ - __EXPECT(expected, seen, !=, 1) + __EXPECT(expected, #expected, seen, #seen, !=, 1)
/** * ASSERT_LT(expected, seen) @@ -376,7 +376,7 @@ * ASSERT_LT(expected, measured): expected < measured */ #define ASSERT_LT(expected, seen) \ - __EXPECT(expected, seen, <, 1) + __EXPECT(expected, #expected, seen, #seen, <, 1)
/** * ASSERT_LE(expected, seen) @@ -387,7 +387,7 @@ * ASSERT_LE(expected, measured): expected <= measured */ #define ASSERT_LE(expected, seen) \ - __EXPECT(expected, seen, <=, 1) + __EXPECT(expected, #expected, seen, #seen, <=, 1)
/** * ASSERT_GT(expected, seen) @@ -398,7 +398,7 @@ * ASSERT_GT(expected, measured): expected > measured */ #define ASSERT_GT(expected, seen) \ - __EXPECT(expected, seen, >, 1) + __EXPECT(expected, #expected, seen, #seen, >, 1)
/** * ASSERT_GE(expected, seen) @@ -409,7 +409,7 @@ * ASSERT_GE(expected, measured): expected >= measured */ #define ASSERT_GE(expected, seen) \ - __EXPECT(expected, seen, >=, 1) + __EXPECT(expected, #expected, seen, #seen, >=, 1)
/** * ASSERT_NULL(seen) @@ -419,7 +419,7 @@ * ASSERT_NULL(measured): NULL == measured */ #define ASSERT_NULL(seen) \ - __EXPECT(NULL, seen, ==, 1) + __EXPECT(NULL, "NULL", seen, #seen, ==, 1)
/** * ASSERT_TRUE(seen) @@ -429,7 +429,7 @@ * ASSERT_TRUE(measured): measured != 0 */ #define ASSERT_TRUE(seen) \ - ASSERT_NE(0, seen) + __EXPECT(0, "0", seen, #seen, !=, 1)
/** * ASSERT_FALSE(seen) @@ -439,7 +439,7 @@ * ASSERT_FALSE(measured): measured == 0 */ #define ASSERT_FALSE(seen) \ - ASSERT_EQ(0, seen) + __EXPECT(0, "0", seen, #seen, ==, 1)
/** * ASSERT_STREQ(expected, seen) @@ -472,7 +472,7 @@ * EXPECT_EQ(expected, measured): expected == measured */ #define EXPECT_EQ(expected, seen) \ - __EXPECT(expected, seen, ==, 0) + __EXPECT(expected, #expected, seen, #seen, ==, 0)
/** * EXPECT_NE(expected, seen) @@ -483,7 +483,7 @@ * EXPECT_NE(expected, measured): expected != measured */ #define EXPECT_NE(expected, seen) \ - __EXPECT(expected, seen, !=, 0) + __EXPECT(expected, #expected, seen, #seen, !=, 0)
/** * EXPECT_LT(expected, seen) @@ -494,7 +494,7 @@ * EXPECT_LT(expected, measured): expected < measured */ #define EXPECT_LT(expected, seen) \ - __EXPECT(expected, seen, <, 0) + __EXPECT(expected, #expected, seen, #seen, <, 0)
/** * EXPECT_LE(expected, seen) @@ -505,7 +505,7 @@ * EXPECT_LE(expected, measured): expected <= measured */ #define EXPECT_LE(expected, seen) \ - __EXPECT(expected, seen, <=, 0) + __EXPECT(expected, #expected, seen, #seen, <=, 0)
/** * EXPECT_GT(expected, seen) @@ -516,7 +516,7 @@ * EXPECT_GT(expected, measured): expected > measured */ #define EXPECT_GT(expected, seen) \ - __EXPECT(expected, seen, >, 0) + __EXPECT(expected, #expected, seen, #seen, >, 0)
/** * EXPECT_GE(expected, seen) @@ -527,7 +527,7 @@ * EXPECT_GE(expected, measured): expected >= measured */ #define EXPECT_GE(expected, seen) \ - __EXPECT(expected, seen, >=, 0) + __EXPECT(expected, #expected, seen, #seen, >=, 0)
/** * EXPECT_NULL(seen) @@ -537,7 +537,7 @@ * EXPECT_NULL(measured): NULL == measured */ #define EXPECT_NULL(seen) \ - __EXPECT(NULL, seen, ==, 0) + __EXPECT(NULL, "NULL", seen, #seen, ==, 0)
/** * EXPECT_TRUE(seen) @@ -547,7 +547,7 @@ * EXPECT_TRUE(measured): 0 != measured */ #define EXPECT_TRUE(seen) \ - EXPECT_NE(0, seen) + __EXPECT(0, "0", seen, #seen, !=, 0)
/** * EXPECT_FALSE(seen) @@ -557,7 +557,7 @@ * EXPECT_FALSE(measured): 0 == measured */ #define EXPECT_FALSE(seen) \ - EXPECT_EQ(0, seen) + __EXPECT(0, "0", seen, #seen, ==, 0)
/** * EXPECT_STREQ(expected, seen) @@ -597,7 +597,7 @@ if (_metadata->passed && _metadata->step < 255) \ _metadata->step++;
-#define __EXPECT(_expected, _seen, _t, _assert) do { \ +#define __EXPECT(_expected, _expected_str, _seen, _seen_str, _t, _assert) do { \ /* Avoid multiple evaluation of the cases */ \ __typeof__(_expected) __exp = (_expected); \ __typeof__(_seen) __seen = (_seen); \ @@ -606,8 +606,8 @@ unsigned long long __exp_print = (uintptr_t)__exp; \ unsigned long long __seen_print = (uintptr_t)__seen; \ __TH_LOG("Expected %s (%llu) %s %s (%llu)", \ - #_expected, __exp_print, #_t, \ - #_seen, __seen_print); \ + _expected_str, __exp_print, #_t, \ + _seen_str, __seen_print); \ _metadata->passed = 0; \ /* Ensure the optional handler is triggered */ \ _metadata->trigger = 1; \
From: Qian Cai cai@lca.pw
[ Upstream commit 6e8830674ea77f57d57a33cca09083b117a71f41 ]
If the kernel is configured with KASAN_EXTRA, the stack size is increased significantly due to setting the GCC -fstack-reuse option to "none" [1]. As a result, it can trigger a stack overrun quite often with 32k stack size compiled using GCC 8. For example, this reproducer
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/sysca...
can trigger a "corrupted stack end detected inside scheduler" very reliably with CONFIG_SCHED_STACK_END_CHECK enabled. There are other reports at:
https://lore.kernel.org/lkml/1542144497.12945.29.camel@gmx.us/ https://lore.kernel.org/lkml/721E7B42-2D55-4866-9C1A-3E8D64F33F9C@gmx.us/
There are just too many functions that could have a large stack with KASAN_EXTRA due to large local variables that have been called over and over again without being able to reuse the stacks. Some noticiable ones are,
size 7536 shrink_inactive_list 7440 shrink_page_list 6560 fscache_stats_show 3920 jbd2_journal_commit_transaction 3216 try_to_unmap_one 3072 migrate_page_move_mapping 3584 migrate_misplaced_transhuge_page 3920 ip_vs_lblcr_schedule 4304 lpfc_nvme_info_show 3888 lpfc_debugfs_nvmestat_data.constprop
There are other 49 functions over 2k in size while compiling kernel with "-Wframe-larger-than=" on this machine. Hence, it is too much work to change Makefiles for each object to compile without -fsanitize-address-use-after-scope individually.
[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715#c23
Signed-off-by: Qian Cai cai@lca.pw Signed-off-by: Will Deacon will.deacon@arm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm64/include/asm/memory.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index b96442960aea..56562ff01076 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -76,12 +76,17 @@ /* * KASAN requires 1/8th of the kernel virtual address space for the shadow * region. KASAN can bloat the stack significantly, so double the (minimum) - * stack size when KASAN is in use. + * stack size when KASAN is in use, and then double it again if KASAN_EXTRA is + * on. */ #ifdef CONFIG_KASAN #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT)) +#ifdef CONFIG_KASAN_EXTRA +#define KASAN_THREAD_SHIFT 2 +#else #define KASAN_THREAD_SHIFT 1 +#endif /* CONFIG_KASAN_EXTRA */ #else #define KASAN_SHADOW_SIZE (0) #define KASAN_THREAD_SHIFT 0
From: Lucas Stach l.stach@pengutronix.de
[ Upstream commit f7542d817733f461258fd3a47d77da35b2d9fc81 ]
The exclusive gates may be set up in the wrong way by software running before the clock driver comes up. In that case the exclusive setup is locked in its initial state, as the complementary function can't be activated without disabling the initial setup first.
To avoid this lock situation, reset the exclusive gates to the off state and allow the kernel to provide the proper setup.
Signed-off-by: Lucas Stach l.stach@pengutronix.de Reviewed-by: Dong Aisheng Aisheng.dong@nxp.com Signed-off-by: Stephen Boyd sboyd@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/clk/imx/clk-imx6q.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/clk/imx/clk-imx6q.c b/drivers/clk/imx/clk-imx6q.c index 8c7c2fcb8d94..c509324f6338 100644 --- a/drivers/clk/imx/clk-imx6q.c +++ b/drivers/clk/imx/clk-imx6q.c @@ -508,8 +508,12 @@ static void __init imx6q_clocks_init(struct device_node *ccm_node) * lvds1_gate and lvds2_gate are pseudo-gates. Both can be * independently configured as clock inputs or outputs. We treat * the "output_enable" bit as a gate, even though it's really just - * enabling clock output. + * enabling clock output. Initially the gate bits are cleared, as + * otherwise the exclusive configuration gets locked in the setup done + * by software running before the clock driver, with no way to change + * it. */ + writel(readl(base + 0x160) & ~0x3c00, base + 0x160); clk[IMX6QDL_CLK_LVDS1_GATE] = imx_clk_gate_exclusive("lvds1_gate", "lvds1_sel", base + 0x160, 10, BIT(12)); clk[IMX6QDL_CLK_LVDS2_GATE] = imx_clk_gate_exclusive("lvds2_gate", "lvds2_sel", base + 0x160, 11, BIT(13));
From: Will Deacon will.deacon@arm.com
[ Upstream commit 33309ecda0070506c49182530abe7728850ebe78 ]
The dcache_by_line_op macro suffers from a couple of small problems:
First, the GAS directives that are currently being used rely on assembler behavior that is not documented, and probably not guaranteed to produce the correct behavior going forward. As a result, we end up with some undefined symbols in cache.o:
$ nm arch/arm64/mm/cache.o ... U civac ... U cvac U cvap U cvau
This is due to the fact that the comparisons used to select the operation type in the dcache_by_line_op macro are comparing symbols not strings, and even though it seems that GAS is doing the right thing here (undefined symbols by the same name are equal to each other), it seems unwise to rely on this.
Second, when patching in a DC CVAP instruction on CPUs that support it, the fallback path consists of a DC CVAU instruction which may be affected by CPU errata that require ARM64_WORKAROUND_CLEAN_CACHE.
Solve these issues by unrolling the various maintenance routines and using the conditional directives that are documented as operating on strings. To avoid the complexity of nested alternatives, we move the DC CVAP patching to __clean_dcache_area_pop, falling back to a branch to __clean_dcache_area_poc if DCPOP is not supported by the CPU.
Reported-by: Ard Biesheuvel ard.biesheuvel@linaro.org Suggested-by: Robin Murphy robin.murphy@arm.com Signed-off-by: Will Deacon will.deacon@arm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm64/include/asm/assembler.h | 30 ++++++++++++++++++------------ arch/arm64/mm/cache.S | 3 +++ 2 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 0bcc98dbba56..f90f5d83b228 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -378,27 +378,33 @@ alternative_endif * size: size of the region * Corrupts: kaddr, size, tmp1, tmp2 */ + .macro __dcache_op_workaround_clean_cache, op, kaddr +alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE + dc \op, \kaddr +alternative_else + dc civac, \kaddr +alternative_endif + .endm + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 dcache_line_size \tmp1, \tmp2 add \size, \kaddr, \size sub \tmp2, \tmp1, #1 bic \kaddr, \kaddr, \tmp2 9998: - .if (\op == cvau || \op == cvac) -alternative_if_not ARM64_WORKAROUND_CLEAN_CACHE - dc \op, \kaddr -alternative_else - dc civac, \kaddr -alternative_endif - .elseif (\op == cvap) -alternative_if ARM64_HAS_DCPOP - sys 3, c7, c12, 1, \kaddr // dc cvap -alternative_else - dc cvac, \kaddr -alternative_endif + .ifc \op, cvau + __dcache_op_workaround_clean_cache \op, \kaddr + .else + .ifc \op, cvac + __dcache_op_workaround_clean_cache \op, \kaddr + .else + .ifc \op, cvap + sys 3, c7, c12, 1, \kaddr // dc cvap .else dc \op, \kaddr .endif + .endif + .endif add \kaddr, \kaddr, \tmp1 cmp \kaddr, \size b.lo 9998b diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 0c22ede52f90..a194fd0e837f 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -212,6 +212,9 @@ ENDPROC(__dma_clean_area) * - size - size in question */ ENTRY(__clean_dcache_area_pop) + alternative_if_not ARM64_HAS_DCPOP + b __clean_dcache_area_poc + alternative_else_nop_endif dcache_by_line_op cvap, sy, x0, x1, x2, x3 ret ENDPIPROC(__clean_dcache_area_pop)
From: Jiong Wang jiong.wang@netronome.com
[ Upstream commit e434b8cdf788568ba65a0a0fd9f3cb41f3ca1803 ]
Currently, the destination register is marked as unknown for 32-bit sub-register move (BPF_MOV | BPF_ALU) whenever the source register type is SCALAR_VALUE.
This is too conservative that some valid cases will be rejected. Especially, this may turn a constant scalar value into unknown value that could break some assumptions of verifier.
For example, test_l4lb_noinline.c has the following C code:
struct real_definition *dst
1: if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6)) 2: return TC_ACT_SHOT; 3: 4: if (dst->flags & F_IPV6) {
get_packet_dst is responsible for initializing "dst" into valid pointer and return true (1), otherwise return false (0). The compiled instruction sequence using alu32 will be:
412: (54) (u32) r7 &= (u32) 1 413: (bc) (u32) r0 = (u32) r7 414: (95) exit
insn 413, a BPF_MOV | BPF_ALU, however will turn r0 into unknown value even r7 contains SCALAR_VALUE 1.
This causes trouble when verifier is walking the code path that hasn't initialized "dst" inside get_packet_dst, for which case 0 is returned and we would then expect verifier concluding line 1 in the above C code pass the "if" check, therefore would skip fall through path starting at line 4. Now, because r0 returned from callee has became unknown value, so verifier won't skip analyzing path starting at line 4 and "dst->flags" requires dereferencing the pointer "dst" which actually hasn't be initialized for this path.
This patch relaxed the code marking sub-register move destination. For a SCALAR_VALUE, it is safe to just copy the value from source then truncate it into 32-bit.
A unit test also included to demonstrate this issue. This test will fail before this patch.
This relaxation could let verifier skipping more paths for conditional comparison against immediate. It also let verifier recording a more accurate/strict value for one register at one state, if this state end up with going through exit without rejection and it is used for state comparison later, then it is possible an inaccurate/permissive value is better. So the real impact on verifier processed insn number is complex. But in all, without this fix, valid program could be rejected.
From real benchmarking on kernel selftests and Cilium bpf tests, there is
no impact on processed instruction number when tests ares compiled with default compilation options. There is slightly improvements when they are compiled with -mattr=+alu32 after this patch.
Also, test_xdp_noinline/-mattr=+alu32 now passed verification. It is rejected before this fix.
Insn processed before/after this patch:
default -mattr=+alu32
Kernel selftest
=== test_xdp.o 371/371 369/369 test_l4lb.o 6345/6345 5623/5623 test_xdp_noinline.o 2971/2971 rejected/2727 test_tcp_estates.o 429/429 430/430
Cilium bpf === bpf_lb-DLB_L3.o: 2085/2085 1685/1687 bpf_lb-DLB_L4.o: 2287/2287 1986/1982 bpf_lb-DUNKNOWN.o: 690/690 622/622 bpf_lxc.o: 95033/95033 N/A bpf_netdev.o: 7245/7245 N/A bpf_overlay.o: 2898/2898 3085/2947
NOTE: - bpf_lxc.o and bpf_netdev.o compiled by -mattr=+alu32 are rejected by verifier due to another issue inside verifier on supporting alu32 binary. - Each cilium bpf program could generate several processed insn number, above number is sum of them.
v1->v2: - Restrict the change on SCALAR_VALUE. - Update benchmark numbers on Cilium bpf tests.
Signed-off-by: Jiong Wang jiong.wang@netronome.com Signed-off-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/bpf/verifier.c | 16 ++++++++++++---- tools/testing/selftests/bpf/test_verifier.c | 13 +++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89cea3ed535d..341806668f03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3285,12 +3285,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err;
if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = regs + insn->src_reg; + struct bpf_reg_state *dst_reg = regs + insn->dst_reg; + if (BPF_CLASS(insn->code) == BPF_ALU64) { /* case: R1 = R2 * copy register state to dest reg */ - regs[insn->dst_reg] = regs[insn->src_reg]; - regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -3298,9 +3301,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; + } else if (src_reg->type == SCALAR_VALUE) { + *dst_reg = *src_reg; + dst_reg->live |= REG_LIVE_WRITTEN; + } else { + mark_reg_unknown(env, regs, + insn->dst_reg); } - mark_reg_unknown(env, regs, insn->dst_reg); - coerce_reg_to_size(®s[insn->dst_reg], 4); + coerce_reg_to_size(dst_reg, 4); } } else { /* case: R = imm diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index e436b67f2426..9db5a7378f40 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -2748,6 +2748,19 @@ static struct bpf_test tests[] = { .result_unpriv = REJECT, .result = ACCEPT, }, + { + "alu32: mov u32 const", + .insns = { + BPF_MOV32_IMM(BPF_REG_7, 0), + BPF_ALU32_IMM(BPF_AND, BPF_REG_7, 1), + BPF_MOV32_REG(BPF_REG_0, BPF_REG_7), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, + }, { "unpriv: partial copy of pointer", .insns = {
From: Masahiro Yamada yamada.masahiro@socionext.com
[ Upstream commit 77c1c0fa8b1477c5799bdad65026ea5ff676da44 ]
Currently, warn_ignore_character() displays invalid file name and line number.
The lexer should use current_file->name and yylineno, while the parser should use zconf_curname() and zconf_lineno().
This difference comes from that the lexer is always going ahead of the parser. The parser needs to look ahead one token to make a shift/reduce decision, so the lexer is requested to scan more text from the input file.
This commit fixes the warning message from warn_ignored_character().
[Test Code]
----(Kconfig begin)---- / -----(Kconfig end)-----
[Output]
Before the fix:
<none>:0:warning: ignoring unsupported character '/'
After the fix:
Kconfig:1:warning: ignoring unsupported character '/'
Signed-off-by: Masahiro Yamada yamada.masahiro@socionext.com Signed-off-by: Sasha Levin sashal@kernel.org --- scripts/kconfig/zconf.l | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/kconfig/zconf.l b/scripts/kconfig/zconf.l index 25bd2b89fe3f..eeac64ccc730 100644 --- a/scripts/kconfig/zconf.l +++ b/scripts/kconfig/zconf.l @@ -73,7 +73,7 @@ static void warn_ignored_character(char chr) { fprintf(stderr, "%s:%d:warning: ignoring unsupported character '%c'\n", - zconf_curname(), zconf_lineno(), chr); + current_file->name, yylineno, chr); } %}
From: Masahiro Yamada yamada.masahiro@socionext.com
[ Upstream commit fbac5977d81cb2b2b7e37b11c459055d9585273c ]
An unterminated string literal followed by new line is passed to the parser (with "multi-line strings not supported" warning shown), then handled properly there.
On the other hand, an unterminated string literal at end of file is never passed to the parser, then results in memory leak.
[Test Code]
----------(Kconfig begin)---------- source "Kconfig.inc"
config A bool "a" -----------(Kconfig end)-----------
--------(Kconfig.inc begin)-------- config B bool "b\No new line at end of file ---------(Kconfig.inc end)---------
[Summary from Valgrind]
Before the fix:
LEAK SUMMARY: definitely lost: 16 bytes in 1 blocks ...
After the fix:
LEAK SUMMARY: definitely lost: 0 bytes in 0 blocks ...
Eliminate the memory leak path by handling this case. Of course, such a Kconfig file is wrong already, so I will add an error message later.
Signed-off-by: Masahiro Yamada yamada.masahiro@socionext.com Signed-off-by: Sasha Levin sashal@kernel.org --- scripts/kconfig/zconf.l | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/scripts/kconfig/zconf.l b/scripts/kconfig/zconf.l index eeac64ccc730..c2f577d71964 100644 --- a/scripts/kconfig/zconf.l +++ b/scripts/kconfig/zconf.l @@ -221,6 +221,8 @@ n [A-Za-z0-9_-] } <<EOF>> { BEGIN(INITIAL); + yylval.string = text; + return T_WORD_QUOTE; } }
From: Jonas Danielsson jonas@orbital-systems.com
[ Upstream commit ae460c115b7aa50c9a36cf78fced07b27962c9d0 ]
On our AT91SAM9260 board we use the same sdio bus for wifi and for the sd card slot. This caused the atmel-mci to give the following splat on the serial console:
------------[ cut here ]------------ WARNING: CPU: 0 PID: 538 at drivers/mmc/host/atmel-mci.c:859 atmci_send_command+0x24/0x44 Modules linked in: CPU: 0 PID: 538 Comm: mmcqd/0 Not tainted 4.14.76 #14 Hardware name: Atmel AT91SAM9 [<c000fccc>] (unwind_backtrace) from [<c000d3dc>] (show_stack+0x10/0x14) [<c000d3dc>] (show_stack) from [<c0017644>] (__warn+0xd8/0xf4) [<c0017644>] (__warn) from [<c0017704>] (warn_slowpath_null+0x1c/0x24) [<c0017704>] (warn_slowpath_null) from [<c033bb9c>] (atmci_send_command+0x24/0x44) [<c033bb9c>] (atmci_send_command) from [<c033e984>] (atmci_start_request+0x1f4/0x2dc) [<c033e984>] (atmci_start_request) from [<c033f3b4>] (atmci_request+0xf0/0x164) [<c033f3b4>] (atmci_request) from [<c0327108>] (mmc_start_request+0x280/0x2d0) [<c0327108>] (mmc_start_request) from [<c032800c>] (mmc_start_areq+0x230/0x330) [<c032800c>] (mmc_start_areq) from [<c03366f8>] (mmc_blk_issue_rw_rq+0xc4/0x310) [<c03366f8>] (mmc_blk_issue_rw_rq) from [<c03372c4>] (mmc_blk_issue_rq+0x118/0x5ac) [<c03372c4>] (mmc_blk_issue_rq) from [<c033781c>] (mmc_queue_thread+0xc4/0x118) [<c033781c>] (mmc_queue_thread) from [<c002daf8>] (kthread+0x100/0x118) [<c002daf8>] (kthread) from [<c000a580>] (ret_from_fork+0x14/0x34) ---[ end trace 594371ddfa284bd6 ]---
This is: WARN_ON(host->cmd);
This was fixed on our board by letting atmci_request_end determine what state we are in. Instead of unconditionally setting it to STATE_IDLE on STATE_END_REQUEST.
Signed-off-by: Jonas Danielsson jonas@orbital-systems.com Signed-off-by: Ulf Hansson ulf.hansson@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/mmc/host/atmel-mci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index be53044086c7..fbc56ee99682 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -1954,13 +1954,14 @@ static void atmci_tasklet_func(unsigned long priv) }
atmci_request_end(host, host->mrq); - state = STATE_IDLE; + goto unlock; /* atmci_request_end() sets host->state */ break; } } while (state != prev_state);
host->state = state;
+unlock: spin_unlock(&host->lock); }
From: Qu Wenruo wqu@suse.com
[ Upstream commit 5eb193812a42dc49331f25137a38dfef9612d3e4 ]
Enhance btrfs_verify_dev_extents() to remember previous checked dev extents, so it can verify no dev extents can overlap.
Analysis from Hans:
"Imagine allocating a DATA|DUP chunk.
In the chunk allocator, we first set... max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE ... which is 10GiB.
Then... /* we don't want a chunk larger than 10% of writeable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size);
Imagine we only have one 7880MiB block device in this filesystem. Now max_chunk_size is down to 788MiB.
The next step in the code is to search for max_stripe_size * dev_stripes amount of free space on the device, which is in our example 1GiB * 2 = 2GiB. Imagine the device has exactly 1578MiB free in one contiguous piece. This amount of bytes will be put in devices_info[ndevs - 1].max_avail
Next we recalculate the stripe_size (which is actually the device extent length), based on the actual maximum amount of available raw disk space: stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
stripe_size is now 789MiB
Next we do... data_stripes = num_stripes / ncopies ...where data_stripes ends up as 1, because num_stripes is 2 (the amount of device extents we're going to have), and DUP has ncopies 2.
Next there's a check... if (stripe_size * data_stripes > max_chunk_size) ...which matches because 789MiB * 1 > 788MiB.
We go into the if code, and next is... stripe_size = div_u64(max_chunk_size, data_stripes); ...which resets stripe_size to max_chunk_size: 788MiB
Next is a fun one... /* bump the answer up to a 16MB boundary */ stripe_size = round_up(stripe_size, SZ_16M); ...which changes stripe_size from 788MiB to 800MiB.
We're not done changing stripe_size yet... /* But don't go higher than the limits we found while searching * for free extents */ stripe_size = min(devices_info[ndevs - 1].max_avail, stripe_size);
This is bad. max_avail is twice the stripe_size (we need to fit 2 device extents on the same device for DUP).
The result here is that 800MiB < 1578MiB, so it's unchanged. However, the resulting DUP chunk will need 1600MiB disk space, which isn't there, and the second dev_extent might extend into the next thing (next dev_extent? end of device?) for 22MiB.
The last shown line of code relies on a situation where there's twice the value of stripe_size present as value for the variable stripe_size when it's DUP. This was actually the case before commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling", from which I quote: "[...] in the meantime there's a check to see if the stripe_size does not exceed max_chunk_size. Since during this check stripe_size is twice the amount as intended, the check will reduce the stripe_size to max_chunk_size if the actual correct to be used stripe_size is more than half the amount of max_chunk_size."
In the previous version of the code, the 16MiB alignment (why is this done, by the way?) would result in a 50% chance that it would actually do an 8MiB alignment for the individual dev_extents, since it was operating on double the size. Does this matter?
Does it matter that stripe_size can be set to anything which is not 16MiB aligned because of the amount of remaining available disk space which is just taken?
What is the main purpose of this round_up?
The most straightforward thing to do seems something like... stripe_size = min( div_u64(devices_info[ndevs - 1].max_avail, dev_stripes), stripe_size ) ..just putting half of the max_avail into stripe_size."
Link: https://lore.kernel.org/linux-btrfs/b3461a38-e5f8-f41d-c67c-2efac8129054@men... Reported-by: Hans van Kranenburg hans.van.kranenburg@mendix.com Signed-off-by: Qu Wenruo wqu@suse.com [ add analysis from report ] Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4405e430da6..5aa6296699b4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7467,6 +7467,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; int ret = 0;
key.objectid = 1; @@ -7511,10 +7513,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext);
+ /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + ret = btrfs_next_item(root, path); if (ret < 0) goto out;
From: Hans van Kranenburg hans.van.kranenburg@mendix.com
[ Upstream commit baf92114c7e6dd6124aa3d506e4bc4b694da3bc3 ]
Commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling" fixed calculating the stripe_size for a new DUP chunk.
However, the same calculation reappears a bit later, and that one was not changed yet. The resulting bug that is exposed is that the newly allocated device extents ('stripes') can have a few MiB overlap with the next thing stored after them, which is another device extent or the end of the disk.
The scenario in which this can happen is: * The block device for the filesystem is less than 10GiB in size. * The amount of contiguous free unallocated disk space chosen to use for chunk allocation is 20% of the total device size, or a few MiB more or less.
An example: - The filesystem device is 7880MiB (max_chunk_size gets set to 788MiB) - There's 1578MiB unallocated raw disk space left in one contiguous piece.
In this case stripe_size is first calculated as 789MiB, (half of 1578MiB).
Since 789MiB (stripe_size * data_stripes) > 788MiB (max_chunk_size), we enter the if block. Now stripe_size value is immediately overwritten while calculating an adjusted value based on max_chunk_size, which ends up as 788MiB.
Next, the value is rounded up to a 16MiB boundary, 800MiB, which is actually more than the value we had before. However, the last comparison fails to detect this, because it's comparing the value with the total amount of free space, which is about twice the size of stripe_size.
In the example above, this means that the resulting raw disk space being allocated is 1600MiB, while only a gap of 1578MiB has been found. The second device extent object for this DUP chunk will overlap for 22MiB with whatever comes next.
The underlying problem here is that the stripe_size is reused all the time for different things. So, when entering the code in the if block, stripe_size is immediately overwritten with something else. If later we decide we want to have the previous value back, then the logic to compute it was copy pasted in again.
With this change, the value in stripe_size is not unnecessarily destroyed, so the duplicated calculation is not needed any more.
Signed-off-by: Hans van Kranenburg hans.van.kranenburg@mendix.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/volumes.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5aa6296699b4..064e86c069ec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4761,19 +4761,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, /* * Use the number of data stripes to figure out how big this chunk * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size + * and compare that answer with the max chunk size. If it's higher, + * we try to reduce stripe_size. */ if (stripe_size * data_stripes > max_chunk_size) { - stripe_size = div_u64(max_chunk_size, data_stripes); - - /* bump the answer up to a 16MB boundary */ - stripe_size = round_up(stripe_size, SZ_16M); - /* - * But don't go higher than the limits we found while searching - * for free extents + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. */ - stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size = min(round_up(div_u64(max_chunk_size, + data_stripes), SZ_16M), stripe_size); }
From: Anand Jain anand.jain@oracle.com
[ Upstream commit d189dd70e2556181732598956d808ea53cc8774e ]
The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread.
The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread.
scrub_setup_ctx() warns as tgtdev is NULL.
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; }
[ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe
Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics:
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <======
[ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50
Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status.
Signed-off-by: Anand Jain anand.jain@oracle.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/dev-replace.c | 63 +++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 22 deletions(-)
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 981434764bb9..c26579d3f7b6 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -800,39 +800,58 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + /* btrfs_dev_replace_finishing() will handle the cleanup part */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = ktime_get_real_seconds(); - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1;
- trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); + btrfs_dev_replace_write_unlock(dev_replace);
- btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret);
- if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + result = -EINVAL; + }
-leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return result; }
From: Filipe Manana fdmanana@suse.com
[ Upstream commit 9a6f209e36500efac51528132a3e3083586eda5f ]
If the quota enable and snapshot creation ioctls are called concurrently we can get into a deadlock where the task enabling quotas will deadlock on the fs_info->qgroup_ioctl_lock mutex because it attempts to lock it twice, or the task creating a snapshot tries to commit the transaction while the task enabling quota waits for the former task to commit the transaction while holding the mutex. The following time diagrams show how both cases happen.
First scenario:
CPU 0 CPU 1
btrfs_ioctl() btrfs_ioctl_quota_ctl() btrfs_quota_enable() mutex_lock(fs_info->qgroup_ioctl_lock) btrfs_start_transaction()
btrfs_ioctl() btrfs_ioctl_snap_create_v2 create_snapshot() --> adds snapshot to the list pending_snapshots of the current transaction
btrfs_commit_transaction() create_pending_snapshots() create_pending_snapshot() qgroup_account_snapshot() btrfs_qgroup_inherit() mutex_lock(fs_info->qgroup_ioctl_lock) --> deadlock, mutex already locked by this task at btrfs_quota_enable()
Second scenario:
CPU 0 CPU 1
btrfs_ioctl() btrfs_ioctl_quota_ctl() btrfs_quota_enable() mutex_lock(fs_info->qgroup_ioctl_lock) btrfs_start_transaction()
btrfs_ioctl() btrfs_ioctl_snap_create_v2 create_snapshot() --> adds snapshot to the list pending_snapshots of the current transaction
btrfs_commit_transaction() --> waits for task at CPU 0 to release its transaction handle
btrfs_commit_transaction() --> sees another task started the transaction commit first --> releases its transaction handle --> waits for the transaction commit to be completed by the task at CPU 1
create_pending_snapshot() qgroup_account_snapshot() btrfs_qgroup_inherit() mutex_lock(fs_info->qgroup_ioctl_lock) --> deadlock, task at CPU 0 has the mutex locked but it is waiting for us to finish the transaction commit
So fix this by setting the quota enabled flag in fs_info after committing the transaction at btrfs_quota_enable(). This ends up serializing quota enable and snapshot creation as if the snapshot creation happened just before the quota enable request. The quota rescan task, scheduled after committing the transaction in btrfs_quote_enable(), will do the accounting.
Fixes: 6426c7ad697d ("btrfs: qgroup: Fix qgroup accounting when creating snapshot") Signed-off-by: Filipe Manana fdmanana@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/qgroup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ff434663d65b..e1fcb28ad4cc 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1013,16 +1013,22 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_abort_transaction(trans, ret); goto out_free_path; } - spin_lock(&fs_info->qgroup_lock); - fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - spin_unlock(&fs_info->qgroup_lock);
ret = btrfs_commit_transaction(trans); trans = NULL; if (ret) goto out_free_path;
+ /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info);
From: Filipe Manana fdmanana@suse.com
[ Upstream commit 5a8067c0d17feb7579db0476191417b441a8996e ]
The available allocation bits members from struct btrfs_fs_info are protected by a sequence lock, and when starting balance we access them incorrectly in two different ways:
1) In the read sequence lock loop at btrfs_balance() we use the values we read from fs_info->avail_*_alloc_bits and we can immediately do actions that have side effects and can not be undone (printing a message and jumping to a label). This is wrong because a retry might be needed, so our actions must not have side effects and must be repeatable as long as read_seqretry() returns a non-zero value. In other words, we were essentially ignoring the sequence lock;
2) Right below the read sequence lock loop, we were reading the values from avail_metadata_alloc_bits and avail_data_alloc_bits without any protection from concurrent writers, that is, reading them outside of the read sequence lock critical section.
So fix this by making sure we only read the available allocation bits while in a read sequence lock critical section and that what we do in the critical section is repeatable (has nothing that can not be undone) so that any eventual retry that is needed is handled properly.
Fixes: de98ced9e743 ("Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits") Fixes: 14506127979a ("btrfs: fix a bogus warning when converting only data or metadata") Reviewed-by: Nikolay Borisov nborisov@suse.com Signed-off-by: Filipe Manana fdmanana@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/volumes.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 064e86c069ec..0ee1cd4b56fb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3712,6 +3712,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; + bool reducing_integrity;
if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3796,24 +3797,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - btrfs_info(fs_info, - "balance: force reducing metadata integrity"); - } else { - btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); - ret = -EINVAL; - goto out; - } - } + !(bctl->meta.target & allowed))) + reducing_integrity = true; + else + reducing_integrity = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq));
- /* if we're not converting, the target field is uninitialized */ - meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->meta.target : fs_info->avail_metadata_alloc_bits; - data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->data.target : fs_info->avail_data_alloc_bits; + if (reducing_integrity) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata integrity"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata integrity, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
From: Johannes Thumshirn jthumshirn@suse.de
[ Upstream commit 1690dd41e0cb1dade80850ed8a3eb0121b96d22f ]
In the error handling block, err holds the return value of either btrfs_del_root_ref() or btrfs_del_inode_ref() but it hasn't been checked since it's introduction with commit fe66a05a0679 (Btrfs: improve error handling for btrfs_insert_dir_item callers) in 2012.
If the error handling in the error handling fails, there's not much left to do and the abort either happened earlier in the callees or is necessary here.
So if one of btrfs_del_root_ref() or btrfs_del_inode_ref() failed, abort the transaction, but still return the original code of the failure stored in 'ret' as this will be reported to the user.
Signed-off-by: Johannes Thumshirn jthumshirn@suse.de Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7158b5b77c9d..1c13b21ea13e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6427,14 +6427,19 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); - + if (err) + btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err;
err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); + if (err) + btrfs_abort_transaction(trans, err); } + + /* Return the original error code */ return ret; }
From: Sergey Senozhatsky sergey.senozhatsky.work@gmail.com
[ Upstream commit d72402145ace0697a6a9e8e75a3de5bf3375f78d ]
LKP has hit yet another circular locking dependency between uart console drivers and debugobjects [1]:
CPU0 CPU1
rhltable_init() __init_work() debug_object_init uart_shutdown() /* db->lock */ /* uart_port->lock */ debug_print_object() free_page() printk() call_console_drivers() debug_check_no_obj_freed() /* uart_port->lock */ /* db->lock */ debug_print_object()
So there are two dependency chains: uart_port->lock -> db->lock And db->lock -> uart_port->lock
This particular circular locking dependency can be addressed in several ways:
a) One way would be to move debug_print_object() out of db->lock scope and, thus, break the db->lock -> uart_port->lock chain. b) Another one would be to free() transmit buffer page out of db->lock in UART code; which is what this patch does.
It makes sense to apply a) and b) independently: there are too many things going on behind free(), none of which depend on uart_port->lock.
The patch fixes transmit buffer page free() in uart_shutdown() and, additionally, in uart_port_startup() (as was suggested by Dmitry Safonov).
[1] https://lore.kernel.org/lkml/20181211091154.GL23332@shao2-debian/T/#u Signed-off-by: Sergey Senozhatsky sergey.senozhatsky@gmail.com Reviewed-by: Petr Mladek pmladek@suse.com Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Jiri Slaby jslaby@suse.com Cc: Andrew Morton akpm@linux-foundation.org Cc: Waiman Long longman@redhat.com Cc: Dmitry Safonov dima@arista.com Cc: Steven Rostedt rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/tty/serial/serial_core.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 80bb56facfb6..ad126f51d549 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -205,10 +205,15 @@ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state, if (!state->xmit.buf) { state->xmit.buf = (unsigned char *) page; uart_circ_clear(&state->xmit); + uart_port_unlock(uport, flags); } else { + uart_port_unlock(uport, flags); + /* + * Do not free() the page under the port lock, see + * uart_shutdown(). + */ free_page(page); } - uart_port_unlock(uport, flags);
retval = uport->ops->startup(uport); if (retval == 0) { @@ -268,6 +273,7 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) struct uart_port *uport = uart_port_check(state); struct tty_port *port = &state->port; unsigned long flags = 0; + char *xmit_buf = NULL;
/* * Set the TTY IO error marker @@ -298,14 +304,18 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) tty_port_set_suspended(port, 0);
/* - * Free the transmit buffer page. + * Do not free() the transmit buffer page under the port lock since + * this can create various circular locking scenarios. For instance, + * console driver may need to allocate/free a debug object, which + * can endup in printk() recursion. */ uart_port_lock(state, flags); - if (state->xmit.buf) { - free_page((unsigned long)state->xmit.buf); - state->xmit.buf = NULL; - } + xmit_buf = state->xmit.buf; + state->xmit.buf = NULL; uart_port_unlock(uport, flags); + + if (xmit_buf) + free_page((unsigned long)xmit_buf); }
/**
From: Adrian Hunter adrian.hunter@intel.com
[ Upstream commit 1c6f709b9f96366cc47af23c05ecec9b8c0c392d ]
Users should never use 'pt=0', but if they do it may give a meaningless error:
$ perf record -e intel_pt/pt=0/u uname Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (intel_pt/pt=0/u).
Fix that by forcing 'pt=1'.
Committer testing:
# perf record -e intel_pt/pt=0/u uname Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (intel_pt/pt=0/u). /bin/dmesg | grep -i perf may provide additional information.
# perf record -e intel_pt/pt=0/u uname pt=0 doesn't make sense, forcing pt=1 Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.020 MB perf.data ] #
Signed-off-by: Adrian Hunter adrian.hunter@intel.com Tested-by: Arnaldo Carvalho de Melo acme@redhat.com Cc: Jiri Olsa jolsa@redhat.com Link: http://lkml.kernel.org/r/b7c5b4e5-9497-10e5-fd43-5f3e4a0fe51d@intel.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/arch/x86/util/intel-pt.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index db0ba8caf5a2..ba8ecaf52200 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -524,10 +524,21 @@ static int intel_pt_validate_config(struct perf_pmu *intel_pt_pmu, struct perf_evsel *evsel) { int err; + char c;
if (!evsel) return 0;
+ /* + * If supported, force pass-through config term (pt=1) even if user + * sets pt=0, which avoids senseless kernel errors. + */ + if (perf_pmu__scan_file(intel_pt_pmu, "format/pt", "%c", &c) == 1 && + !(evsel->attr.config & 1)) { + pr_warning("pt=0 doesn't make sense, forcing pt=1\n"); + evsel->attr.config |= 1; + } + err = intel_pt_val_config_term(intel_pt_pmu, "caps/cycle_thresholds", "cyc_thresh", "caps/psb_cyc", evsel->attr.config);
From: Florian Fainelli f.fainelli@gmail.com
[ Upstream commit 24f967337f6d6bce931425769c0f5ff5cf2d212e ]
The breakpoint tests on the ARM 32-bit kernel are broken in several ways.
The breakpoint length requested does not necessarily match whether the function address has the Thumb bit (bit 0) set or not, and this does matter to the ARM kernel hw_breakpoint infrastructure. See [1] for background.
[1]: https://lkml.org/lkml/2018/11/15/205
As Will indicated, the overflow handling would require single-stepping which is not supported at the moment. Just disable those tests for the ARM 32-bit platforms and update the comment above to explain these limitations.
Co-developed-by: Will Deacon will.deacon@arm.com Signed-off-by: Florian Fainelli f.fainelli@gmail.com Signed-off-by: Will Deacon will.deacon@arm.com Acked-by: Jiri Olsa jolsa@redhat.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Link: http://lkml.kernel.org/r/20181203191138.2419-1-f.fainelli@gmail.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/tests/bp_signal.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index a467615c5a0e..910e25e64188 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -291,12 +291,20 @@ int test__bp_signal(struct test *test __maybe_unused, int subtest __maybe_unused
bool test__bp_signal_is_supported(void) { -/* - * The powerpc so far does not have support to even create - * instruction breakpoint using the perf event interface. - * Once it's there we can release this. - */ -#if defined(__powerpc__) || defined(__s390x__) + /* + * PowerPC and S390 do not support creation of instruction + * breakpoints using the perf_event interface. + * + * ARM requires explicit rounding down of the instruction + * pointer in Thumb mode, and then requires the single-step + * to be handled explicitly in the overflow handler to avoid + * stepping into the SIGIO handler and getting stuck on the + * breakpointed instruction. + * + * Just disable the test for these architectures until these + * issues are resolved. + */ +#if defined(__powerpc__) || defined(__s390x__) || defined(__arm__) return false; #else return true;
From: Arnaldo Carvalho de Melo acme@redhat.com
[ Upstream commit 2f5302533f306d5ee87bd375aef9ca35b91762cb ]
The strncpy() function may leave the destination string buffer unterminated, better use strlcpy() that we have a __weak fallback implementation for systems without it.
In this specific case this would only happen if fgets() was buggy, as its man page states that it should read one less byte than the size of the destination buffer, so that it can put the nul byte at the end of it, so it would never copy 255 non-nul chars, as fgets reads into the orig buffer at most 254 non-nul chars and terminates it. But lets just switch to strlcpy to keep the original intent and silence the gcc 8.2 warning.
This fixes this warning on an Alpine Linux Edge system with gcc 8.2:
In function 'cpu_model', inlined from 'svg_cpu_box' at util/svghelper.c:378:2: util/svghelper.c:337:5: error: 'strncpy' output may be truncated copying 255 bytes from a string of length 255 [-Werror=stringop-truncation] strncpy(cpu_m, &buf[13], 255); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Cc: Adrian Hunter adrian.hunter@intel.com Cc: Jiri Olsa jolsa@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Arjan van de Ven arjan@linux.intel.com Fixes: f48d55ce7871 ("perf: Add a SVG helper library file") Link: https://lkml.kernel.org/n/tip-xzkoo0gyr56gej39ltivuh9g@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/util/svghelper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 1cbada2dc6be..f735ee038713 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -334,7 +334,7 @@ static char *cpu_model(void) if (file) { while (fgets(buf, 255, file)) { if (strstr(buf, "model name")) { - strncpy(cpu_m, &buf[13], 255); + strlcpy(cpu_m, &buf[13], 255); break; } }
From: Arnaldo Carvalho de Melo acme@redhat.com
[ Upstream commit bd8d57fb7e25e9fcf67a9eef5fa13aabe2016e07 ]
The strncpy() function may leave the destination string buffer unterminated, better use strlcpy() that we have a __weak fallback implementation for systems without it.
This fixes this warning on an Alpine Linux Edge system with gcc 8.2:
util/parse-events.c: In function 'print_symbol_events': util/parse-events.c:2465:4: error: 'strncpy' specified bound 100 equals destination size [-Werror=stringop-truncation] strncpy(name, syms->symbol, MAX_NAME_LEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'print_symbol_events.constprop', inlined from 'print_events' at util/parse-events.c:2508:2: util/parse-events.c:2465:4: error: 'strncpy' specified bound 100 equals destination size [-Werror=stringop-truncation] strncpy(name, syms->symbol, MAX_NAME_LEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'print_symbol_events.constprop', inlined from 'print_events' at util/parse-events.c:2511:2: util/parse-events.c:2465:4: error: 'strncpy' specified bound 100 equals destination size [-Werror=stringop-truncation] strncpy(name, syms->symbol, MAX_NAME_LEN); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors
Cc: Adrian Hunter adrian.hunter@intel.com Cc: Jiri Olsa jolsa@kernel.org Cc: Namhyung Kim namhyung@kernel.org Fixes: 947b4ad1d198 ("perf list: Fix max event string size") Link: https://lkml.kernel.org/n/tip-b663e33bm6x8hrkie4uxh7u2@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f8cd3e7c9186..ebb18a9bc460 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2454,7 +2454,7 @@ void print_symbol_events(const char *event_glob, unsigned type, if (!name_only && strlen(syms->alias)) snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias); else - strncpy(name, syms->symbol, MAX_NAME_LEN); + strlcpy(name, syms->symbol, MAX_NAME_LEN);
evt_list[evt_i] = strdup(name); if (evt_list[evt_i] == NULL)
From: Andi Kleen ak@linux.intel.com
[ Upstream commit 91b2b97025097ce7ca7536bc87eba2bf14760fb4 ]
Fix incorrect event names for the Load_Miss_Real_Latency metric for Skylake and Skylake Server.
Fixes https://github.com/andikleen/pmu-tools/issues/158
Before:
% perf stat -M Load_Miss_Real_Latency true event syntax error: '..ss.pending,mem_load_retired.l1_miss_ps,mem_load_retired.fb_hit_ps}:W' ___ parser error
Usage: perf stat [<options>] [<command>]
-M, --metrics <metric/metric group list> monitor specified metrics or metric groups (separated by ,)
After:
% perf stat -M Load_Miss_Real_Latency true
Performance counter stats for 'true':
279,204 l1d_pend_miss.pending # 14.0 Load_Miss_Real_Latency 4,784 mem_load_uops_retired.l1_miss 15,188 mem_load_uops_retired.hit_lfb
0.000899640 seconds time elapsed
Signed-off-by: Andi Kleen ak@linux.intel.com Acked-by: Jiri Olsa jolsa@kernel.org Tested-by: Arnaldo Carvalho de Melo acme@redhat.com Link: http://lkml.kernel.org/r/20181120050635.4215-1-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json | 2 +- tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index 36c903faed0b..71e9737f4614 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -73,7 +73,7 @@ }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS_PS + MEM_LOAD_RETIRED.FB_HIT_PS )", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", "MetricGroup": "Memory_Bound;Memory_Lat", "MetricName": "Load_Miss_Real_Latency" }, diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 36c903faed0b..71e9737f4614 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -73,7 +73,7 @@ }, { "BriefDescription": "Actual Average Latency for L1 data-cache miss demand loads", - "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS_PS + MEM_LOAD_RETIRED.FB_HIT_PS )", + "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_RETIRED.L1_MISS + MEM_LOAD_RETIRED.FB_HIT )", "MetricGroup": "Memory_Bound;Memory_Lat", "MetricName": "Load_Miss_Real_Latency" },
From: Taehee Yoo ap420073@gmail.com
[ Upstream commit 06aa151ad1fc74a49b45336672515774a678d78d ]
If same destination IP address config is already existing, that config is just used. MAC address also should be same. However, there is no MAC address checking routine. So that MAC address checking routine is added.
test commands: %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:21 --total-nodes 2 --local-node 1
After this patch, above commands are disallowed.
Signed-off-by: Taehee Yoo ap420073@gmail.com Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2c8d313ae216..e40e6795bd20 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -496,7 +496,8 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (IS_ERR(config)) return PTR_ERR(config); } - } + } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) + return -EINVAL;
ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) {
From: Taehee Yoo ap420073@gmail.com
[ Upstream commit b12f7bad5ad3724d19754390a3e80928525c0769 ]
When network namespace is destroyed, both clusterip_tg_destroy() and clusterip_net_exit() are called. and clusterip_net_exit() is called before clusterip_tg_destroy(). Hence cleanup check code in clusterip_net_exit() doesn't make sense.
test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1
splat looks like: [ 341.184508] WARNING: CPU: 1 PID: 87 at net/ipv4/netfilter/ipt_CLUSTERIP.c:840 clusterip_net_exit+0x319/0x380 [ipt_CLUSTERIP] [ 341.184850] Modules linked in: ipt_CLUSTERIP nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 xt_tcpudp iptable_filter bpfilter ip_tables x_tables [ 341.184850] CPU: 1 PID: 87 Comm: kworker/u4:2 Not tainted 4.19.0-rc5+ #16 [ 341.227509] Workqueue: netns cleanup_net [ 341.227509] RIP: 0010:clusterip_net_exit+0x319/0x380 [ipt_CLUSTERIP] [ 341.227509] Code: 0f 85 7f fe ff ff 48 c7 c2 80 64 2c c0 be a8 02 00 00 48 c7 c7 a0 63 2c c0 c6 05 18 6e 00 00 01 e8 bc 38 ff f5 e9 5b fe ff ff <0f> 0b e9 33 ff ff ff e8 4b 90 50 f6 e9 2d fe ff ff 48 89 df e8 de [ 341.227509] RSP: 0018:ffff88011086f408 EFLAGS: 00010202 [ 341.227509] RAX: dffffc0000000000 RBX: 1ffff1002210de85 RCX: 0000000000000000 [ 341.227509] RDX: 1ffff1002210de85 RSI: ffff880110813be8 RDI: ffffed002210de58 [ 341.227509] RBP: ffff88011086f4d0 R08: 0000000000000000 R09: 0000000000000000 [ 341.227509] R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff1002210de81 [ 341.227509] R13: ffff880110625a48 R14: ffff880114cec8c8 R15: 0000000000000014 [ 341.227509] FS: 0000000000000000(0000) GS:ffff880116600000(0000) knlGS:0000000000000000 [ 341.227509] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 341.227509] CR2: 00007f11fd38e000 CR3: 000000013ca16000 CR4: 00000000001006e0 [ 341.227509] Call Trace: [ 341.227509] ? __clusterip_config_find+0x460/0x460 [ipt_CLUSTERIP] [ 341.227509] ? default_device_exit+0x1ca/0x270 [ 341.227509] ? remove_proc_entry+0x1cd/0x390 [ 341.227509] ? dev_change_net_namespace+0xd00/0xd00 [ 341.227509] ? __init_waitqueue_head+0x130/0x130 [ 341.227509] ops_exit_list.isra.10+0x94/0x140 [ 341.227509] cleanup_net+0x45b/0x900 [ ... ]
Fixes: 613d0776d3fe ("netfilter: exit_net cleanup check added") Signed-off-by: Taehee Yoo ap420073@gmail.com Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e40e6795bd20..33491cb6b9d1 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -838,7 +838,6 @@ static void clusterip_net_exit(struct net *net) cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); - WARN_ON_ONCE(!list_empty(&cn->configs)); }
static struct pernet_operations clusterip_net_ops = {
From: Taehee Yoo ap420073@gmail.com
[ Upstream commit 5a86d68bcf02f2d1e9a5897dd482079fd5f75e7f ]
When network namespace is destroyed, cleanup_net() is called. cleanup_net() holds pernet_ops_rwsem then calls each ->exit callback. So that clusterip_tg_destroy() is called by cleanup_net(). And clusterip_tg_destroy() calls unregister_netdevice_notifier().
But both cleanup_net() and clusterip_tg_destroy() hold same lock(pernet_ops_rwsem). hence deadlock occurrs.
After this patch, only 1 notifier is registered when module is inserted. And all of configs are added to per-net list.
test commands: %ip netns add vm1 %ip netns exec vm1 bash %ip link set lo up %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \ -j CLUSTERIP --new --hashmode sourceip \ --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1 %exit %ip netns del vm1
splat looks like: [ 341.809674] ============================================ [ 341.809674] WARNING: possible recursive locking detected [ 341.809674] 4.19.0-rc5+ #16 Tainted: G W [ 341.809674] -------------------------------------------- [ 341.809674] kworker/u4:2/87 is trying to acquire lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460 [ 341.809674] [ 341.809674] but task is already holding lock: [ 341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] other info that might help us debug this: [ 341.809674] Possible unsafe locking scenario: [ 341.809674] [ 341.809674] CPU0 [ 341.809674] ---- [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] lock(pernet_ops_rwsem); [ 341.809674] [ 341.809674] *** DEADLOCK *** [ 341.809674] [ 341.809674] May be due to missing lock nesting notation [ 341.809674] [ 341.809674] 3 locks held by kworker/u4:2/87: [ 341.809674] #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0 [ 341.809674] #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0 [ 341.809674] #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900 [ 341.809674] [ 341.809674] stack backtrace: [ 341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G W 4.19.0-rc5+ #16 [ 341.809674] Workqueue: netns cleanup_net [ 341.809674] Call Trace: [ ... ] [ 342.070196] down_write+0x93/0x160 [ 342.070196] ? unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? down_read+0x1e0/0x1e0 [ 342.070196] ? sched_clock_cpu+0x126/0x170 [ 342.070196] ? find_held_lock+0x39/0x1c0 [ 342.070196] unregister_netdevice_notifier+0x8c/0x460 [ 342.070196] ? register_netdevice_notifier+0x790/0x790 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? __local_bh_enable_ip+0xe9/0x1b0 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.070196] ? trace_hardirqs_on+0x93/0x210 [ 342.070196] ? __bpf_trace_preemptirq_template+0x10/0x10 [ 342.070196] ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP] [ 342.123094] clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP] [ 342.123094] ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP] [ 342.123094] ? cleanup_match+0x17d/0x200 [ip_tables] [ 342.123094] ? xt_unregister_table+0x215/0x300 [x_tables] [ 342.123094] ? kfree+0xe2/0x2a0 [ 342.123094] cleanup_entry+0x1d5/0x2f0 [ip_tables] [ 342.123094] ? cleanup_match+0x200/0x200 [ip_tables] [ 342.123094] __ipt_unregister_table+0x9b/0x1a0 [ip_tables] [ 342.123094] iptable_filter_net_exit+0x43/0x80 [iptable_filter] [ 342.123094] ops_exit_list.isra.10+0x94/0x140 [ 342.123094] cleanup_net+0x45b/0x900 [ ... ]
Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev") Signed-off-by: Taehee Yoo ap420073@gmail.com Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 155 ++++++++++++++++------------- 1 file changed, 87 insertions(+), 68 deletions(-)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 33491cb6b9d1..fb1e7f237f53 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -57,17 +57,14 @@ struct clusterip_config { enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ struct rcu_head rcu; - + struct net *net; /* netns for pernet list */ char ifname[IFNAMSIZ]; /* device ifname */ - struct notifier_block notifier; /* refresh c->ifindex in it */ };
#ifdef CONFIG_PROC_FS static const struct file_operations clusterip_proc_fops; #endif
-static unsigned int clusterip_net_id __read_mostly; - struct clusterip_net { struct list_head configs; /* lock protects the configs list */ @@ -78,16 +75,30 @@ struct clusterip_net { #endif };
+static unsigned int clusterip_net_id __read_mostly; +static inline struct clusterip_net *clusterip_pernet(struct net *net) +{ + return net_generic(net, clusterip_net_id); +} + static inline void clusterip_config_get(struct clusterip_config *c) { refcount_inc(&c->refcount); }
- static void clusterip_config_rcu_free(struct rcu_head *head) { - kfree(container_of(head, struct clusterip_config, rcu)); + struct clusterip_config *config; + struct net_device *dev; + + config = container_of(head, struct clusterip_config, rcu); + dev = dev_get_by_name(config->net, config->ifname); + if (dev) { + dev_mc_del(dev, config->clustermac); + dev_put(dev); + } + kfree(config); }
static inline void @@ -101,9 +112,9 @@ clusterip_config_put(struct clusterip_config *c) * entry(rule) is removed, remove the config from lists, but don't free it * yet, since proc-files could still be holding references */ static inline void -clusterip_config_entry_put(struct net *net, struct clusterip_config *c) +clusterip_config_entry_put(struct clusterip_config *c) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(c->net);
local_bh_disable(); if (refcount_dec_and_lock(&c->entries, &cn->lock)) { @@ -118,8 +129,6 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c) spin_unlock(&cn->lock); local_bh_enable();
- unregister_netdevice_notifier(&c->notifier); - return; } local_bh_enable(); @@ -129,7 +138,7 @@ static struct clusterip_config * __clusterip_config_find(struct net *net, __be32 clusterip) { struct clusterip_config *c; - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net);
list_for_each_entry_rcu(c, &cn->configs, list) { if (c->clusterip == clusterip) @@ -181,32 +190,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); + struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c;
- c = container_of(this, struct clusterip_config, notifier); - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; - } - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, c->ifname)) { - c->ifindex = dev->ifindex; - dev_mc_add(dev, c->clustermac); - } else if (dev->ifindex == c->ifindex) { - dev_mc_del(dev, c->clustermac); - c->ifindex = -1; + spin_lock_bh(&cn->lock); + list_for_each_entry_rcu(c, &cn->configs, list) { + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, c->ifname)) { + c->ifindex = dev->ifindex; + dev_mc_add(dev, c->clustermac); + } else if (dev->ifindex == c->ifindex) { + dev_mc_del(dev, c->clustermac); + c->ifindex = -1; + } + break; } - break; } + spin_unlock_bh(&cn->lock);
return NOTIFY_DONE; } @@ -215,30 +229,44 @@ static struct clusterip_config * clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, __be32 ip, const char *iniface) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; + struct net_device *dev; int err;
+ if (iniface[0] == '\0') { + pr_info("Please specify an interface name\n"); + return ERR_PTR(-EINVAL); + } + c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return ERR_PTR(-ENOMEM);
- strcpy(c->ifname, iniface); - c->ifindex = -1; - c->clusterip = ip; + dev = dev_get_by_name(net, iniface); + if (!dev) { + pr_info("no such interface %s\n", iniface); + kfree(c); + return ERR_PTR(-ENOENT); + } + c->ifindex = dev->ifindex; + strcpy(c->ifname, dev->name); memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); + dev_mc_add(dev, c->clustermac); + dev_put(dev); + + c->clusterip = ip; c->num_total_nodes = i->num_total_nodes; clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; + c->net = net; refcount_set(&c->refcount, 1);
spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { - spin_unlock_bh(&cn->lock); - kfree(c); - - return ERR_PTR(-EBUSY); + err = -EBUSY; + goto out_config_put; }
list_add_rcu(&c->list, &cn->configs); @@ -260,22 +288,17 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, } #endif
- c->notifier.notifier_call = clusterip_netdev_event; - err = register_netdevice_notifier(&c->notifier); - if (!err) { - refcount_set(&c->entries, 1); - return c; - } + refcount_set(&c->entries, 1); + return c;
#ifdef CONFIG_PROC_FS - proc_remove(c->pde); err: #endif spin_lock_bh(&cn->lock); list_del_rcu(&c->list); +out_config_put: spin_unlock_bh(&cn->lock); clusterip_config_put(c); - return ERR_PTR(err); }
@@ -475,21 +498,6 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) &e->ip.dst.s_addr); return -EINVAL; } else { - struct net_device *dev; - - if (e->ip.iniface[0] == '\0') { - pr_info("Please specify an interface name\n"); - return -EINVAL; - } - - dev = dev_get_by_name(par->net, e->ip.iniface); - if (!dev) { - pr_info("no such interface %s\n", - e->ip.iniface); - return -ENOENT; - } - dev_put(dev); - config = clusterip_config_init(par->net, cipinfo, e->ip.dst.s_addr, e->ip.iniface); @@ -503,7 +511,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); - clusterip_config_entry_put(par->net, config); + clusterip_config_entry_put(config); clusterip_config_put(config); return ret; } @@ -525,7 +533,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
/* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ - clusterip_config_entry_put(par->net, cipinfo->config); + clusterip_config_entry_put(cipinfo->config);
clusterip_config_put(cipinfo->config);
@@ -807,7 +815,7 @@ static const struct file_operations clusterip_proc_fops = {
static int clusterip_net_init(struct net *net) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); int ret;
INIT_LIST_HEAD(&cn->configs); @@ -832,7 +840,7 @@ static int clusterip_net_init(struct net *net)
static void clusterip_net_exit(struct net *net) { - struct clusterip_net *cn = net_generic(net, clusterip_net_id); + struct clusterip_net *cn = clusterip_pernet(net); #ifdef CONFIG_PROC_FS proc_remove(cn->procdir); cn->procdir = NULL; @@ -847,6 +855,10 @@ static struct pernet_operations clusterip_net_ops = { .size = sizeof(struct clusterip_net), };
+struct notifier_block cip_netdev_notifier = { + .notifier_call = clusterip_netdev_event +}; + static int __init clusterip_tg_init(void) { int ret; @@ -859,11 +871,17 @@ static int __init clusterip_tg_init(void) if (ret < 0) goto cleanup_subsys;
+ ret = register_netdevice_notifier(&cip_netdev_notifier); + if (ret < 0) + goto unregister_target; + pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION);
return 0;
+unregister_target: + xt_unregister_target(&clusterip_tg_reg); cleanup_subsys: unregister_pernet_subsys(&clusterip_net_ops); return ret; @@ -873,6 +891,7 @@ static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
+ unregister_netdevice_notifier(&cip_netdev_notifier); xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops);
From: Hui Wang john.wanghui@huawei.com
[ Upstream commit aa02ef099cff042c2a9109782ec2bf1bffc955d4 ]
nr_cpu_ids can be limited on the command line via nr_cpus=. This can break the logical package management because it results in a smaller number of packages while in kdump kernel.
Check below case: There is a two sockets system, each socket has 8 cores, which has 16 logical cpus while HT was turn on.
0 1 2 3 4 5 6 7 | 16 17 18 19 20 21 22 23 cores on socket 0 threads on socket 0 8 9 10 11 12 13 14 15 | 24 25 26 27 28 29 30 31 cores on socket 1 threads on socket 1
While starting the kdump kernel with command line option nr_cpus=16 panic was triggered on one of the cpus 24-31 eg. 26, then online cpu will be 1-15, 26(cpu 0 was disabled in kdump), ncpus will be 16 and __max_logical_packages will be 1, but actually two packages were booted on.
This issue can reproduced by set kdump option nr_cpus=<real physical core numbers>, and then trigger panic on last socket's thread, for example:
taskset -c 26 echo c > /proc/sysrq-trigger
Use total_cpus which will not be limited by nr_cpus command line to calculate the value of __max_logical_packages.
Signed-off-by: Hui Wang john.wanghui@huawei.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: guijianfeng@huawei.com Cc: wencongyang2@huawei.com Cc: douliyang1@huawei.com Cc: qiaonuohan@huawei.com Link: https://lkml.kernel.org/r/20181107023643.22174-1-john.wanghui@huawei.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f02ecaf97904..6489067b78a4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1346,7 +1346,7 @@ void __init calculate_max_logical_packages(void) * extrapolate the boot cpu's data to all packages. */ ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); - __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); pr_info("Max logical packages: %u\n", __max_logical_packages); }
From: AliOS system security alios_sys_security@linux.alibaba.com
[ Upstream commit 8d683dcd65c037efc9fb38c696ec9b65b306e573 ]
The iv_offset in the mapping table of crypt target is a 64bit number when IV algorithm is plain64, plain64be, essiv or benbi. It will be assigned to iv_offset of struct crypt_config, cc_sector of struct convert_context and iv_sector of struct dm_crypt_request. These structures members are defined as a sector_t. But sector_t is 32bit when CONFIG_LBDAF is not set in 32bit kernel. In this situation sector_t is not big enough to store the 64bit iv_offset.
Here is a reproducer. Prepare test image and device (loop is automatically allocated by cryptsetup):
# dd if=/dev/zero of=tst.img bs=1M count=1 # echo "tst"|cryptsetup open --type plain -c aes-xts-plain64 \ --skip 500000000000000000 tst.img test
On 32bit system (use IV offset value that overflows to 64bit; CONFIG_LBDAF if off) and device checksum is wrong:
# dmsetup table test --showkeys 0 2048 crypt aes-xts-plain64 dfa7cfe3c481f2239155739c42e539ae8f2d38f304dcc89d20b26f69daaf0933 3551657984 7:0 0
# sha256sum /dev/mapper/test 533e25c09176632b3794f35303488c4a8f3f965dffffa6ec2df347c168cb6c19 /dev/mapper/test
On 64bit system (and on 32bit system with the patch), table and checksum is now correct:
# dmsetup table test --showkeys 0 2048 crypt aes-xts-plain64 dfa7cfe3c481f2239155739c42e539ae8f2d38f304dcc89d20b26f69daaf0933 500000000000000000 7:0 0
# sha256sum /dev/mapper/test 5d16160f9d5f8c33d8051e65fdb4f003cc31cd652b5abb08f03aa6fce0df75fc /dev/mapper/test
Signed-off-by: AliOS system security alios_sys_security@linux.alibaba.com Tested-and-Reviewed-by: Milan Broz gmazyland@gmail.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/md/dm-crypt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0481223b1deb..f192ba38f332 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -49,7 +49,7 @@ struct convert_context { struct bio *bio_out; struct bvec_iter iter_in; struct bvec_iter iter_out; - sector_t cc_sector; + u64 cc_sector; atomic_t cc_pending; union { struct skcipher_request *req; @@ -81,7 +81,7 @@ struct dm_crypt_request { struct convert_context *ctx; struct scatterlist sg_in[4]; struct scatterlist sg_out[4]; - sector_t iv_sector; + u64 iv_sector; };
struct crypt_config; @@ -160,7 +160,7 @@ struct crypt_config { struct iv_lmk_private lmk; struct iv_tcw_private tcw; } iv_gen_private; - sector_t iv_offset; + u64 iv_offset; unsigned int iv_size; unsigned short int sector_size; unsigned char sector_shift;
From: Nikos Tsironis ntsironis@arrikto.com
[ Upstream commit d7e6b8dfc7bcb3f4f3a18313581f67486a725b52 ]
When using kcopyd to run callbacks through dm_kcopyd_do_callback() or submitting copy jobs with a source size of 0, the jobs are pushed directly to the complete_jobs list, which could be under processing by the kcopyd thread. As a result, the kcopyd thread can continue running completed jobs indefinitely, without releasing the CPU, as long as someone keeps submitting new completed jobs through the aforementioned paths. Processing of work items, queued for execution on the same CPU as the currently running kcopyd thread, is thus stalled for excessive amounts of time, hurting performance.
Running the following test, from the device mapper test suite [1],
dmtest run --suite snapshot -n parallel_io_to_many_snaps_N
, with 8 active snapshots, we get, in dmesg, messages like the following:
[68899.948523] BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 95s! [68899.949282] Showing busy workqueues and worker pools: [68899.949288] workqueue events: flags=0x0 [68899.949295] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256 [68899.949306] pending: vmstat_shepherd, cache_reap [68899.949331] workqueue mm_percpu_wq: flags=0x8 [68899.949337] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949345] pending: vmstat_update [68899.949387] workqueue dm_bufio_cache: flags=0x8 [68899.949392] pwq 4: cpus=2 node=0 flags=0x0 nice=0 active=1/256 [68899.949400] pending: work_fn [dm_bufio] [68899.949423] workqueue kcopyd: flags=0x8 [68899.949429] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949437] pending: do_work [dm_mod] [68899.949452] workqueue kcopyd: flags=0x8 [68899.949458] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256 [68899.949466] in-flight: 13:do_work [dm_mod] [68899.949474] pending: do_work [dm_mod] [68899.949487] workqueue kcopyd: flags=0x8 [68899.949493] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949501] pending: do_work [dm_mod] [68899.949515] workqueue kcopyd: flags=0x8 [68899.949521] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949529] pending: do_work [dm_mod] [68899.949541] workqueue kcopyd: flags=0x8 [68899.949547] pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 [68899.949555] pending: do_work [dm_mod] [68899.949568] pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=95s workers=4 idle: 27130 27223 1084
Fix this by splitting the complete_jobs list into two parts: A user facing part, named callback_jobs, and one used internally by kcopyd, retaining the name complete_jobs. dm_kcopyd_do_callback() and dispatch_job() now push their jobs to the callback_jobs list, which is spliced to the complete_jobs list once, every time the kcopyd thread wakes up. This prevents kcopyd from hogging the CPU indefinitely and causing workqueue stalls.
Re-running the aforementioned test:
* Workqueue stalls are eliminated * The maximum writing time among all targets is reduced from 09m37.10s to 06m04.85s and the total run time of the test is reduced from 10m43.591s to 7m19.199s
[1] https://github.com/jthornber/device-mapper-test-suite
Signed-off-by: Nikos Tsironis ntsironis@arrikto.com Signed-off-by: Ilias Tsitsimpis iliastsi@arrikto.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/md/dm-kcopyd.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 2fc4213e02b5..671c24332802 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -56,15 +56,17 @@ struct dm_kcopyd_client { atomic_t nr_jobs;
/* - * We maintain three lists of jobs: + * We maintain four lists of jobs: * * i) jobs waiting for pages * ii) jobs that have pages, and are waiting for the io to be issued. - * iii) jobs that have completed. + * iii) jobs that don't need to do any IO and just run a callback + * iv) jobs that have completed. * - * All three of these are protected by job_lock. + * All four of these are protected by job_lock. */ spinlock_t job_lock; + struct list_head callback_jobs; struct list_head complete_jobs; struct list_head io_jobs; struct list_head pages_jobs; @@ -625,6 +627,7 @@ static void do_work(struct work_struct *work) struct dm_kcopyd_client *kc = container_of(work, struct dm_kcopyd_client, kcopyd_work); struct blk_plug plug; + unsigned long flags;
/* * The order that these are called is *very* important. @@ -633,6 +636,10 @@ static void do_work(struct work_struct *work) * list. io jobs call wake when they complete and it all * starts again. */ + spin_lock_irqsave(&kc->job_lock, flags); + list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs); + spin_unlock_irqrestore(&kc->job_lock, flags); + blk_start_plug(&plug); process_jobs(&kc->complete_jobs, kc, run_complete_job); process_jobs(&kc->pages_jobs, kc, run_pages_job); @@ -650,7 +657,7 @@ static void dispatch_job(struct kcopyd_job *job) struct dm_kcopyd_client *kc = job->kc; atomic_inc(&kc->nr_jobs); if (unlikely(!job->source.count)) - push(&kc->complete_jobs, job); + push(&kc->callback_jobs, job); else if (job->pages == &zero_page_list) push(&kc->io_jobs, job); else @@ -858,7 +865,7 @@ void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err) job->read_err = read_err; job->write_err = write_err;
- push(&kc->complete_jobs, job); + push(&kc->callback_jobs, job); wake(kc); } EXPORT_SYMBOL(dm_kcopyd_do_callback); @@ -888,6 +895,7 @@ struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *thro return ERR_PTR(-ENOMEM);
spin_lock_init(&kc->job_lock); + INIT_LIST_HEAD(&kc->callback_jobs); INIT_LIST_HEAD(&kc->complete_jobs); INIT_LIST_HEAD(&kc->io_jobs); INIT_LIST_HEAD(&kc->pages_jobs); @@ -939,6 +947,7 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc) /* Wait for completion of all jobs submitted by this client. */ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
+ BUG_ON(!list_empty(&kc->callback_jobs)); BUG_ON(!list_empty(&kc->complete_jobs)); BUG_ON(!list_empty(&kc->io_jobs)); BUG_ON(!list_empty(&kc->pages_jobs));
From: Michael Petlan mpetlan@redhat.com
[ Upstream commit 51433ead1460fb3f46e1c34f68bb22fd2dd0f5d0 ]
Some 'perf stat' options do not make sense to be negated (event, cgroup), some do not have negated path implemented (metrics). Due to that, it is better to disable the "no-" prefix for them, since otherwise, the later opt-parsing segfaults.
Before:
$ perf stat --no-metrics -- ls Segmentation fault (core dumped)
After:
$ perf stat --no-metrics -- ls Error: option `no-metrics' isn't available Usage: perf stat [<options>] [<command>]
Signed-off-by: Michael Petlan mpetlan@redhat.com Tested-by: Arnaldo Carvalho de Melo acme@redhat.com LPU-Reference: 1485912065.62416880.1544457604340.JavaMail.zimbra@redhat.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/builtin-stat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d097b5b47eb8..40720150ccd8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1961,7 +1961,7 @@ static int parse_metric_groups(const struct option *opt, return metricgroup__parse_groups(opt, str, &metric_events); }
-static const struct option stat_options[] = { +static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), OPT_CALLBACK('e', "event", &evsel_list, "event", @@ -2847,6 +2847,12 @@ int cmd_stat(int argc, const char **argv) return -ENOMEM;
parse_events__shrink_config_terms(); + + /* String-parsing callback-based options would segfault when negated */ + set_option_flag(stat_options, 'e', "event", PARSE_OPT_NONEG); + set_option_flag(stat_options, 'M', "metrics", PARSE_OPT_NONEG); + set_option_flag(stat_options, 'G', "cgroup", PARSE_OPT_NONEG); + argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands, (const char **) stat_usage, PARSE_OPT_STOP_AT_NON_OPTION);
From: Arnaldo Carvalho de Melo acme@redhat.com
[ Upstream commit ece9804985b57e1ccd83b1fb6288520955a29d51 ]
At some point we decided not to directly include kernel sources files when building tools/perf/, but when tools/lib/subcmd/ was forked from tools/perf it somehow ended up adding it via these two lines in its Makefile:
CFLAGS += -I$(srctree)/include/uapi CFLAGS += -I$(srctree)/include
As $(srctree) points to the kernel sources.
Removing those lines and keeping just:
CFLAGS += -I$(srctree)/tools/include/
Is enough to build tools/perf and tools/objtool.
This fixes the build when building from the sources in environments such as the Android NDK crossbuilding from a fedora:26 system:
subcmd-util.h:11:15: error: expected ',' or ';' before 'void' static inline void report(const char *prefix, const char *err, va_list params) ^ In file included from /git/perf/include/uapi/linux/stddef.h:2:0, from /git/perf/include/uapi/linux/posix_types.h:5, from /opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/sys/types.h:36, from /opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/unistd.h:33, from run-command.c:2: subcmd-util.h:18:17: error: '__no_instrument_function__' attribute applies only to functions
The /opt/android-ndk-r12b/platforms/android-24/arch-arm/usr/include/sys/types.h file that includes linux/posix_types.h ends up getting the one in the kernel sources causing the breakage. Fix it.
Test built tools/objtool/ too.
Reported-by: Jiri Olsa jolsa@kernel.org Tested-by: Jiri Olsa jolsa@kernel.org Cc: Adrian Hunter adrian.hunter@intel.com Cc: Josh Poimboeuf jpoimboe@redhat.com Cc: Namhyung Kim namhyung@kernel.org Fixes: 4b6ab94eabe4 ("perf subcmd: Create subcmd library") Link: https://lkml.kernel.org/n/tip-5lhaoecrj12t0bqwvpiu14sm@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/lib/subcmd/Makefile | 2 -- 1 file changed, 2 deletions(-)
diff --git a/tools/lib/subcmd/Makefile b/tools/lib/subcmd/Makefile index 95563b8e1ad7..ed61fb3a46c0 100644 --- a/tools/lib/subcmd/Makefile +++ b/tools/lib/subcmd/Makefile @@ -36,8 +36,6 @@ endif CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
CFLAGS += -I$(srctree)/tools/include/ -CFLAGS += -I$(srctree)/include/uapi -CFLAGS += -I$(srctree)/include
SUBCMD_IN := $(OUTPUT)libsubcmd-in.o
From: Nikos Tsironis ntsironis@arrikto.com
[ Upstream commit 721b1d98fb517ae99ab3b757021cf81db41e67be ]
kcopyd has no upper limit to the number of jobs one can allocate and issue. Under certain workloads this can lead to excessive memory usage and workqueue stalls. For example, when creating multiple dm-snapshot targets with a 4K chunk size and then writing to the origin through the page cache. Syncing the page cache causes a large number of BIOs to be issued to the dm-snapshot origin target, which itself issues an even larger (because of the BIO splitting taking place) number of kcopyd jobs.
Running the following test, from the device mapper test suite [1],
dmtest run --suite snapshot -n many_snapshots_of_same_volume_N
, with 8 active snapshots, results in the kcopyd job slab cache growing to 10G. Depending on the available system RAM this can lead to the OOM killer killing user processes:
[463.492878] kthreadd invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=1, oom_score_adj=0 [463.492894] kthreadd cpuset=/ mems_allowed=0 [463.492948] CPU: 7 PID: 2 Comm: kthreadd Not tainted 4.19.0-rc7 #3 [463.492950] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [463.492952] Call Trace: [463.492964] dump_stack+0x7d/0xbb [463.492973] dump_header+0x6b/0x2fc [463.492987] ? lockdep_hardirqs_on+0xee/0x190 [463.493012] oom_kill_process+0x302/0x370 [463.493021] out_of_memory+0x113/0x560 [463.493030] __alloc_pages_slowpath+0xf40/0x1020 [463.493055] __alloc_pages_nodemask+0x348/0x3c0 [463.493067] cache_grow_begin+0x81/0x8b0 [463.493072] ? cache_grow_begin+0x874/0x8b0 [463.493078] fallback_alloc+0x1e4/0x280 [463.493092] kmem_cache_alloc_node+0xd6/0x370 [463.493098] ? copy_process.part.31+0x1c5/0x20d0 [463.493105] copy_process.part.31+0x1c5/0x20d0 [463.493115] ? __lock_acquire+0x3cc/0x1550 [463.493121] ? __switch_to_asm+0x34/0x70 [463.493129] ? kthread_create_worker_on_cpu+0x70/0x70 [463.493135] ? finish_task_switch+0x90/0x280 [463.493165] _do_fork+0xe0/0x6d0 [463.493191] ? kthreadd+0x19f/0x220 [463.493233] kernel_thread+0x25/0x30 [463.493235] kthreadd+0x1bf/0x220 [463.493242] ? kthread_create_on_cpu+0x90/0x90 [463.493248] ret_from_fork+0x3a/0x50 [463.493279] Mem-Info: [463.493285] active_anon:20631 inactive_anon:4831 isolated_anon:0 [463.493285] active_file:80216 inactive_file:80107 isolated_file:435 [463.493285] unevictable:0 dirty:51266 writeback:109372 unstable:0 [463.493285] slab_reclaimable:31191 slab_unreclaimable:3483521 [463.493285] mapped:526 shmem:4903 pagetables:1759 bounce:0 [463.493285] free:33623 free_pcp:2392 free_cma:0 ... [463.493489] Unreclaimable slab info: [463.493513] Name Used Total [463.493522] bio-6 1028KB 1028KB [463.493525] bio-5 1028KB 1028KB [463.493528] dm_snap_pending_exception 236783KB 243789KB [463.493531] dm_exception 41KB 42KB [463.493534] bio-4 1216KB 1216KB [463.493537] bio-3 439396KB 439396KB [463.493539] kcopyd_job 6973427KB 6973427KB ... [463.494340] Out of memory: Kill process 1298 (ruby2.3) score 1 or sacrifice child [463.494673] Killed process 1298 (ruby2.3) total-vm:435740kB, anon-rss:20180kB, file-rss:4kB, shmem-rss:0kB [463.506437] oom_reaper: reaped process 1298 (ruby2.3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
Moreover, issuing a large number of kcopyd jobs results in kcopyd hogging the CPU, while processing them. As a result, processing of work items, queued for execution on the same CPU as the currently running kcopyd thread, is stalled for long periods of time, hurting performance. Running the aforementioned test we get, in dmesg, messages like the following:
[67501.194592] BUG: workqueue lockup - pool cpus=4 node=0 flags=0x0 nice=0 stuck for 27s! [67501.195586] Showing busy workqueues and worker pools: [67501.195591] workqueue events: flags=0x0 [67501.195597] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195611] pending: cache_reap [67501.195641] workqueue mm_percpu_wq: flags=0x8 [67501.195645] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195656] pending: vmstat_update [67501.195682] workqueue kblockd: flags=0x18 [67501.195687] pwq 5: cpus=2 node=0 flags=0x0 nice=-20 active=1/256 [67501.195698] pending: blk_timeout_work [67501.195753] workqueue kcopyd: flags=0x8 [67501.195757] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195768] pending: do_work [dm_mod] [67501.195802] workqueue kcopyd: flags=0x8 [67501.195806] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195817] pending: do_work [dm_mod] [67501.195834] workqueue kcopyd: flags=0x8 [67501.195838] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195848] pending: do_work [dm_mod] [67501.195881] workqueue kcopyd: flags=0x8 [67501.195885] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=1/256 [67501.195896] pending: do_work [dm_mod] [67501.195920] workqueue kcopyd: flags=0x8 [67501.195924] pwq 8: cpus=4 node=0 flags=0x0 nice=0 active=2/256 [67501.195935] in-flight: 67:do_work [dm_mod] [67501.195945] pending: do_work [dm_mod] [67501.195961] pool 8: cpus=4 node=0 flags=0x0 nice=0 hung=27s workers=3 idle: 129 23765
The root cause for these issues is the way dm-snapshot uses kcopyd. In particular, the lack of an explicit or implicit limit to the maximum number of in-flight COW jobs. The merging path is not affected because it implicitly limits the in-flight kcopyd jobs to one.
Fix these issues by using a semaphore to limit the maximum number of in-flight kcopyd jobs. We grab the semaphore before allocating a new kcopyd job in start_copy() and start_full_bio() and release it after the job finishes in copy_callback().
The initial semaphore value is configurable through a module parameter, to allow fine tuning the maximum number of in-flight COW jobs. Setting this parameter to zero initializes the semaphore to INT_MAX.
A default value of 2048 maximum in-flight kcopyd jobs was chosen. This value was decided experimentally as a trade-off between memory consumption, stalling the kernel's workqueues and maintaining a high enough throughput.
Re-running the aforementioned test:
* Workqueue stalls are eliminated * kcopyd's job slab cache uses a maximum of 130MB * The time taken by the test to write to the snapshot-origin target is reduced from 05m20.48s to 03m26.38s
[1] https://github.com/jthornber/device-mapper-test-suite
Signed-off-by: Nikos Tsironis ntsironis@arrikto.com Signed-off-by: Ilias Tsitsimpis iliastsi@arrikto.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/md/dm-snap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+)
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ae4b33d10924..36805b12661e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -19,6 +19,7 @@ #include <linux/vmalloc.h> #include <linux/log2.h> #include <linux/dm-kcopyd.h> +#include <linux/semaphore.h>
#include "dm.h"
@@ -105,6 +106,9 @@ struct dm_snapshot { /* The on disk metadata handler */ struct dm_exception_store *store;
+ /* Maximum number of in-flight COW jobs. */ + struct semaphore cow_count; + struct dm_kcopyd_client *kcopyd_client;
/* Wait for events based on state_bits */ @@ -145,6 +149,19 @@ struct dm_snapshot { #define RUNNING_MERGE 0 #define SHUTDOWN_MERGE 1
+/* + * Maximum number of chunks being copied on write. + * + * The value was decided experimentally as a trade-off between memory + * consumption, stalling the kernel's workqueues and maintaining a high enough + * throughput. + */ +#define DEFAULT_COW_THRESHOLD 2048 + +static int cow_threshold = DEFAULT_COW_THRESHOLD; +module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644); +MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write"); + DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, "A percentage of time allocated for copy on write");
@@ -1190,6 +1207,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_hash_tables; }
+ sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX); + s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(s->kcopyd_client)) { r = PTR_ERR(s->kcopyd_client); @@ -1575,6 +1594,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context) rb_link_node(&pe->out_of_order_node, parent, p); rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree); } + up(&s->cow_count); }
/* @@ -1598,6 +1618,7 @@ static void start_copy(struct dm_snap_pending_exception *pe) dest.count = src.count;
/* Hand over to kcopyd */ + down(&s->cow_count); dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe); }
@@ -1617,6 +1638,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, pe->full_bio = bio; pe->full_bio_end_io = bio->bi_end_io;
+ down(&s->cow_count); callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client, copy_callback, pe);
From: Leo Yan leo.yan@linaro.org
[ Upstream commit 43fd56669c28cd354e9228bdb58e4bca1c1a8b66 ]
The structure cs_etm_queue uses 'prev_packet' to point to previous packet, this can be used to combine with new coming packet to generate samples.
In function cs_etm__flush() it swaps packets only when the flag 'etm->synth_opts.last_branch' is true, this means that it will not swap packets if without option '--itrace=il' to generate last branch entries; thus for this case the 'prev_packet' doesn't point to the correct previous packet and the stale packet still will be used to generate sequential sample. Thus if dump trace with 'perf script' command we can see the incorrect flow with the stale packet's address info.
This patch corrects packets swapping in cs_etm__flush(); except using the flag 'etm->synth_opts.last_branch' it also checks the another flag 'etm->sample_branches', if any flag is true then it swaps packets so can save correct content to 'prev_packet'. Finally this can fix the wrong program flow dumping issue.
The patch has a minor refactoring to use 'etm->synth_opts.last_branch' instead of 'etmq->etm->synth_opts.last_branch' for condition checking, this is consistent with that is done in cs_etm__sample().
Signed-off-by: Leo Yan leo.yan@linaro.org Reviewed-by: Mathieu Poirier mathieu.poirier@linaro.org Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Jiri Olsa jolsa@redhat.com Cc: Mike Leach mike.leach@linaro.org Cc: Namhyung Kim namhyung@kernel.org Cc: Robert Walker robert.walker@arm.com Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1544513908-16805-2-git-send-email-leo.yan@linaro.or... Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/util/cs-etm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/util/cs-etm.c b/tools/perf/util/cs-etm.c index ca577658e890..7b5e15cc6b71 100644 --- a/tools/perf/util/cs-etm.c +++ b/tools/perf/util/cs-etm.c @@ -1005,7 +1005,7 @@ static int cs_etm__flush(struct cs_etm_queue *etmq) }
swap_packet: - if (etmq->etm->synth_opts.last_branch) { + if (etm->sample_branches || etm->synth_opts.last_branch) { /* * Swap PACKET with PREV_PACKET: PACKET becomes PREV_PACKET for * the next incoming packet.
From: Arnaldo Carvalho de Melo acme@redhat.com
[ Upstream commit 748fe0889c1ff12d378946bd5326e8ee8eacf5cf ]
There are systems such as the Android NDK API level 24 has the sigqueue() function but doesn't provide a prototype, adding noise to the build:
util/evlist.c: In function 'perf_evlist__prepare_workload': util/evlist.c:1494:4: warning: implicit declaration of function 'sigqueue' [-Wimplicit-function-declaration] if (sigqueue(getppid(), SIGUSR1, val)) ^ util/evlist.c:1494:4: warning: nested extern declaration of 'sigqueue' [-Wnested-externs]
Define a LACKS_SIGQUEUE_PROTOTYPE define so that code needing that can get a prototype.
Checked in the bionic git repo to be available since level 23:
https://android.googlesource.com/platform/bionic/+/master/libc/include/signa...
int sigqueue(pid_t __pid, int __signal, const union sigval __value) __INTRODUCED_IN(23);
Cc: Adrian Hunter adrian.hunter@intel.com Cc: Jiri Olsa jolsa@kernel.org Cc: Namhyung Kim namhyung@kernel.org Link: https://lkml.kernel.org/n/tip-lmhpev1uni9kdrv7j29glyov@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/Makefile.config | 1 + tools/perf/util/evlist.c | 4 ++++ 2 files changed, 5 insertions(+)
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index e30d20fb482d..ff7a753cc487 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -294,6 +294,7 @@ ifndef NO_BIONIC $(call feature_check,bionic) ifeq ($(feature-bionic), 1) BIONIC := 1 + CFLAGS += -DLACKS_SIGQUEUE_PROTOTYPE EXTLIBS := $(filter-out -lrt,$(EXTLIBS)) EXTLIBS := $(filter-out -lpthread,$(EXTLIBS)) endif diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index be440df29615..819aa4491b53 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -34,6 +34,10 @@ #include <linux/log2.h> #include <linux/err.h>
+#ifdef LACKS_SIGQUEUE_PROTOTYPE +int sigqueue(pid_t pid, int sig, const union sigval value); +#endif + #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) #define SID(e, x, y) xyarray__entry(e->sample_id, x, y)
From: Arnaldo Carvalho de Melo acme@redhat.com
[ Upstream commit d7a8c4a6a055097a67ccfa3ca7c9ff1b64603a70 ]
There are systems such as the Android NDK API level 24 has the open_memstream() function but doesn't provide a prototype, adding noise to the build:
builtin-timechart.c: In function 'cat_backtrace': builtin-timechart.c:486:2: warning: implicit declaration of function 'open_memstream' [-Wimplicit-function-declaration] FILE *f = open_memstream(&p, &p_len); ^ builtin-timechart.c:486:2: warning: nested extern declaration of 'open_memstream' [-Wnested-externs] builtin-timechart.c:486:12: warning: initialization makes pointer from integer without a cast FILE *f = open_memstream(&p, &p_len); ^
Define a LACKS_OPEN_MEMSTREAM_PROTOTYPE define so that code needing that can get a prototype.
Checked in the bionic git repo to be available since level 23:
https://android.googlesource.com/platform/bionic/+/master/libc/include/stdio...
FILE* open_memstream(char** __ptr, size_t* __size_ptr) __INTRODUCED_IN(23);
Cc: Adrian Hunter adrian.hunter@intel.com Cc: Jiri Olsa jolsa@kernel.org Cc: Namhyung Kim namhyung@kernel.org Link: https://lkml.kernel.org/n/tip-343ashae97e5bq6vizusyfno@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- tools/perf/Makefile.config | 1 + tools/perf/builtin-timechart.c | 4 ++++ 2 files changed, 5 insertions(+)
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index ff7a753cc487..f00ea77f5f08 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -295,6 +295,7 @@ ifndef NO_BIONIC ifeq ($(feature-bionic), 1) BIONIC := 1 CFLAGS += -DLACKS_SIGQUEUE_PROTOTYPE + CFLAGS += -DLACKS_OPEN_MEMSTREAM_PROTOTYPE EXTLIBS := $(filter-out -lrt,$(EXTLIBS)) EXTLIBS := $(filter-out -lpthread,$(EXTLIBS)) endif diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index a827919c6263..775b99833e51 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -43,6 +43,10 @@ #include "util/data.h" #include "util/debug.h"
+#ifdef LACKS_OPEN_MEMSTREAM_PROTOTYPE +FILE *open_memstream(char **ptr, size_t *sizeloc); +#endif + #define SUPPORT_OLD_POWER_EVENTS 1 #define PWR_EVENT_EXIT -1
From: Javier Barrio javier.barrio.mart@gmail.com
[ Upstream commit 41c4f85cdac280d356df1f483000ecec4a8868be ]
Commit 1fa5efe3622db58cb8c7b9a50665e9eb9a6c7e97 (ext4: Use generic helpers for quotaon and quotaoff) made possible to call quotactl(Q_XQUOTAON/OFF) on ext4 filesystems with sysfile quota support. This leads to calling dquot_enable/disable without s_umount held in excl. mode, because quotactl_cmd_onoff checks only for Q_QUOTAON/OFF.
The following WARN_ON_ONCE triggers (in this case for dquot_enable, ext4, latest Linus' tree):
[ 117.807056] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: quota,prjquota
[...]
[ 155.036847] WARNING: CPU: 0 PID: 2343 at fs/quota/dquot.c:2469 dquot_enable+0x34/0xb9 [ 155.036851] Modules linked in: quota_v2 quota_tree ipv6 af_packet joydev mousedev psmouse serio_raw pcspkr i2c_piix4 intel_agp intel_gtt e1000 ttm drm_kms_helper drm agpgart fb_sys_fops syscopyarea sysfillrect sysimgblt i2c_core input_leds kvm_intel kvm irqbypass qemu_fw_cfg floppy evdev parport_pc parport button crc32c_generic dm_mod ata_generic pata_acpi ata_piix libata loop ext4 crc16 mbcache jbd2 usb_storage usbcore sd_mod scsi_mod [ 155.036901] CPU: 0 PID: 2343 Comm: qctl Not tainted 4.20.0-rc6-00025-gf5d582777bcb #9 [ 155.036903] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 155.036911] RIP: 0010:dquot_enable+0x34/0xb9 [ 155.036915] Code: 41 56 41 55 41 54 55 53 4c 8b 6f 28 74 02 0f 0b 4d 8d 7d 70 49 89 fc 89 cb 41 89 d6 89 f5 4c 89 ff e8 23 09 ea ff 85 c0 74 0a <0f> 0b 4c 89 ff e8 8b 09 ea ff 85 db 74 6a 41 8b b5 f8 00 00 00 0f [ 155.036918] RSP: 0018:ffffb09b00493e08 EFLAGS: 00010202 [ 155.036922] RAX: 0000000000000001 RBX: 0000000000000008 RCX: 0000000000000008 [ 155.036924] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9781b67cd870 [ 155.036926] RBP: 0000000000000002 R08: 0000000000000000 R09: 61c8864680b583eb [ 155.036929] R10: ffffb09b00493e48 R11: ffffffffff7ce7d4 R12: ffff9781b7ee8d78 [ 155.036932] R13: ffff9781b67cd800 R14: 0000000000000004 R15: ffff9781b67cd870 [ 155.036936] FS: 00007fd813250b88(0000) GS:ffff9781ba000000(0000) knlGS:0000000000000000 [ 155.036939] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 155.036942] CR2: 00007fd812ff61d6 CR3: 000000007c882000 CR4: 00000000000006b0 [ 155.036951] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 155.036953] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 155.036955] Call Trace: [ 155.037004] dquot_quota_enable+0x8b/0xd0 [ 155.037011] kernel_quotactl+0x628/0x74e [ 155.037027] ? do_mprotect_pkey+0x2a6/0x2cd [ 155.037034] __x64_sys_quotactl+0x1a/0x1d [ 155.037041] do_syscall_64+0x55/0xe4 [ 155.037078] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 155.037105] RIP: 0033:0x7fd812fe1198 [ 155.037109] Code: 02 77 0d 48 89 c1 48 c1 e9 3f 75 04 48 8b 04 24 48 83 c4 50 5b c3 48 83 ec 08 49 89 ca 48 63 d2 48 63 ff b8 b3 00 00 00 0f 05 <48> 89 c7 e8 c1 eb ff ff 5a c3 48 63 ff b8 bb 00 00 00 0f 05 48 89 [ 155.037112] RSP: 002b:00007ffe8cd7b050 EFLAGS: 00000206 ORIG_RAX: 00000000000000b3 [ 155.037116] RAX: ffffffffffffffda RBX: 00007ffe8cd7b148 RCX: 00007fd812fe1198 [ 155.037119] RDX: 0000000000000000 RSI: 00007ffe8cd7cea9 RDI: 0000000000580102 [ 155.037121] RBP: 00007ffe8cd7b0f0 R08: 000055fc8eba8a9d R09: 0000000000000000 [ 155.037124] R10: 00007ffe8cd7b074 R11: 0000000000000206 R12: 00007ffe8cd7b168 [ 155.037126] R13: 000055fc8eba8897 R14: 0000000000000000 R15: 0000000000000000 [ 155.037131] ---[ end trace 210f864257175c51 ]---
and then the syscall proceeds without s_umount locking.
This patch locks the superblock ->s_umount sem. in exclusive mode for all Q_XQUOTAON/OFF quotactls too in addition to Q_QUOTAON/OFF.
AFAICT, other than ext4, only xfs and ocfs2 are affected by this change. The VFS will now call in xfs_quota_* functions with s_umount held, which wasn't the case before. This looks good to me but I can not say for sure. Ext4 and ocfs2 where already beeing called with s_umount exclusive via quota_quotaon/off which is basically the same.
Signed-off-by: Javier Barrio javier.barrio.mart@gmail.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin sashal@kernel.org --- fs/quota/quota.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/quota/quota.c b/fs/quota/quota.c index f0cbf58ad4da..fd5dd806f1b9 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -791,7 +791,8 @@ static int quotactl_cmd_write(int cmd) /* Return true if quotactl command is manipulating quota on/off state */ static bool quotactl_cmd_onoff(int cmd) { - return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF); + return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) || + (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF); }
/*
From: Yangtao Li tiny.windzz@gmail.com
[ Upstream commit 5eb73c831171115d3b4347e1e7124a5a35d8086c ]
The function of_find_node_by_path() acquires a reference to the node returned by it and that reference needs to be dropped by its caller.
integrator_ap_timer_init_of() doesn't do that. The pri_node and the sec_node are used as an identifier to compare against the current node, so we can directly drop the refcount after getting the node from the path as it is not used as pointer.
By dropping the refcount right after getting it, a single variable is needed instead of two.
Fix this by use a single variable and drop the refcount right after of_find_node_by_path().
Signed-off-by: Yangtao Li tiny.windzz@gmail.com Signed-off-by: Daniel Lezcano daniel.lezcano@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/clocksource/timer-integrator-ap.c | 25 +++++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c index 62d24690ba02..9701107806a7 100644 --- a/drivers/clocksource/timer-integrator-ap.c +++ b/drivers/clocksource/timer-integrator-ap.c @@ -181,8 +181,7 @@ static int __init integrator_ap_timer_init_of(struct device_node *node) int irq; struct clk *clk; unsigned long rate; - struct device_node *pri_node; - struct device_node *sec_node; + struct device_node *alias_node;
base = of_io_request_and_map(node, 0, "integrator-timer"); if (IS_ERR(base)) @@ -204,7 +203,18 @@ static int __init integrator_ap_timer_init_of(struct device_node *node) return err; }
- pri_node = of_find_node_by_path(path); + alias_node = of_find_node_by_path(path); + + /* + * The pointer is used as an identifier not as a pointer, we + * can drop the refcount on the of__node immediately after + * getting it. + */ + of_node_put(alias_node); + + if (node == alias_node) + /* The primary timer lacks IRQ, use as clocksource */ + return integrator_clocksource_init(rate, base);
err = of_property_read_string(of_aliases, "arm,timer-secondary", &path); @@ -213,14 +223,11 @@ static int __init integrator_ap_timer_init_of(struct device_node *node) return err; }
+ alias_node = of_find_node_by_path(path);
- sec_node = of_find_node_by_path(path); - - if (node == pri_node) - /* The primary timer lacks IRQ, use as clocksource */ - return integrator_clocksource_init(rate, base); + of_node_put(alias_node);
- if (node == sec_node) { + if (node == alias_node) { /* The secondary timer will drive the clock event */ irq = irq_of_parse_and_map(node, 0); return integrator_clockevent_init(rate, base, irq);
From: Milan Broz gmazyland@gmail.com
[ Upstream commit ef87bfc24f9b8da82c89aff493df20f078bc9cb1 ]
Reference to a device in device-mapper table contains offset in sectors.
If the sector_t is 32bit integer (CONFIG_LBDAF is not set), then several device-mapper targets can overflow this offset and validity check is then performed on a wrong offset and a wrong table is activated.
See for example (on 32bit without CONFIG_LBDAF) this overflow:
# dmsetup create test --table "0 2048 linear /dev/sdg 4294967297" # dmsetup table test 0 2048 linear 8:96 1
This patch adds explicit check for overflow if the offset is sector_t type.
Signed-off-by: Milan Broz gmazyland@gmail.com Reviewed-by: Mikulas Patocka mpatocka@redhat.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-delay.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/md/dm-linear.c | 2 +- drivers/md/dm-raid1.c | 3 ++- drivers/md/dm-unstripe.c | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f192ba38f332..f2ec882f96be 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2780,7 +2780,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) }
ret = -EINVAL; - if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { + if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { ti->error = "Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 2fb7bb4304ad..fddffe251bf6 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -141,7 +141,7 @@ static int delay_class_ctr(struct dm_target *ti, struct delay_class *c, char **a unsigned long long tmpll; char dummy;
- if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { + if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { ti->error = "Invalid device sector"; return -EINVAL; } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 32aabe27b37c..b86d2439ffc7 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -213,7 +213,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) devname = dm_shift_arg(&as);
r = -EINVAL; - if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { + if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1 || tmpll != (sector_t)tmpll) { ti->error = "Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 2f7c44a006c4..caa08c4b84cd 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -45,7 +45,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) }
ret = -EINVAL; - if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) { ti->error = "Invalid device sector"; goto bad; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 79eab1071ec2..5a51151f680d 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -943,7 +943,8 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, char dummy; int ret;
- if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { + if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1 || + offset != (sector_t)offset) { ti->error = "Invalid offset"; return -EINVAL; } diff --git a/drivers/md/dm-unstripe.c b/drivers/md/dm-unstripe.c index 954b7ab4e684..e673dacf6418 100644 --- a/drivers/md/dm-unstripe.c +++ b/drivers/md/dm-unstripe.c @@ -78,7 +78,7 @@ static int unstripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err; }
- if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1) { + if (sscanf(argv[4], "%llu%c", &start, &dummy) != 1 || start != (sector_t)start) { ti->error = "Invalid striped device offset"; goto err; }
From: Raghuram Hegde raghuram.hegde@intel.com
[ Upstream commit 2da711bcebe81209a9f2f90e145600eb1bae2b71 ]
Include the new USB product ID for Intel Bluetooth device 22260 family(CcPeak)
The /sys/kernel/debug/usb/devices portion for this device is:
T: Bus=01 Lev=01 Prnt=01 Port=02 Cnt=02 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 2.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=8087 ProdID=0029 Rev= 0.01 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms I: If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 63 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 63 Ivl=1ms
Signed-off-by: Raghuram Hegde raghuram.hegde@intel.com Signed-off-by: Chethan T N chethan.tumkur.narayan@intel.com Signed-off-by: Marcel Holtmann marcel@holtmann.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/bluetooth/btusb.c | 72 ++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 35 deletions(-)
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index cd2e5cf14ea5..77b67a5f21ee 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -343,6 +343,7 @@ static const struct usb_device_id blacklist_table[] = { /* Intel Bluetooth devices */ { USB_DEVICE(0x8087, 0x0025), .driver_info = BTUSB_INTEL_NEW }, { USB_DEVICE(0x8087, 0x0026), .driver_info = BTUSB_INTEL_NEW }, + { USB_DEVICE(0x8087, 0x0029), .driver_info = BTUSB_INTEL_NEW }, { USB_DEVICE(0x8087, 0x07da), .driver_info = BTUSB_CSR }, { USB_DEVICE(0x8087, 0x07dc), .driver_info = BTUSB_INTEL }, { USB_DEVICE(0x8087, 0x0a2a), .driver_info = BTUSB_INTEL }, @@ -2054,6 +2055,35 @@ static int btusb_send_frame_intel(struct hci_dev *hdev, struct sk_buff *skb) return -EILSEQ; }
+static bool btusb_setup_intel_new_get_fw_name(struct intel_version *ver, + struct intel_boot_params *params, + char *fw_name, size_t len, + const char *suffix) +{ + switch (ver->hw_variant) { + case 0x0b: /* SfP */ + case 0x0c: /* WsP */ + snprintf(fw_name, len, "intel/ibt-%u-%u.%s", + le16_to_cpu(ver->hw_variant), + le16_to_cpu(params->dev_revid), + suffix); + break; + case 0x11: /* JfP */ + case 0x12: /* ThP */ + case 0x13: /* HrP */ + case 0x14: /* CcP */ + snprintf(fw_name, len, "intel/ibt-%u-%u-%u.%s", + le16_to_cpu(ver->hw_variant), + le16_to_cpu(ver->hw_revision), + le16_to_cpu(ver->fw_revision), + suffix); + break; + default: + return false; + } + return true; +} + static int btusb_setup_intel_new(struct hci_dev *hdev) { struct btusb_data *data = hci_get_drvdata(hdev); @@ -2105,7 +2135,7 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) case 0x11: /* JfP */ case 0x12: /* ThP */ case 0x13: /* HrP */ - case 0x14: /* QnJ, IcP */ + case 0x14: /* CcP */ break; default: bt_dev_err(hdev, "Unsupported Intel hardware variant (%u)", @@ -2189,23 +2219,9 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) * ibt-<hw_variant>-<hw_revision>-<fw_revision>.sfi. * */ - switch (ver.hw_variant) { - case 0x0b: /* SfP */ - case 0x0c: /* WsP */ - snprintf(fwname, sizeof(fwname), "intel/ibt-%u-%u.sfi", - le16_to_cpu(ver.hw_variant), - le16_to_cpu(params.dev_revid)); - break; - case 0x11: /* JfP */ - case 0x12: /* ThP */ - case 0x13: /* HrP */ - case 0x14: /* QnJ, IcP */ - snprintf(fwname, sizeof(fwname), "intel/ibt-%u-%u-%u.sfi", - le16_to_cpu(ver.hw_variant), - le16_to_cpu(ver.hw_revision), - le16_to_cpu(ver.fw_revision)); - break; - default: + err = btusb_setup_intel_new_get_fw_name(&ver, ¶ms, fwname, + sizeof(fwname), "sfi"); + if (!err) { bt_dev_err(hdev, "Unsupported Intel firmware naming"); return -EINVAL; } @@ -2221,23 +2237,9 @@ static int btusb_setup_intel_new(struct hci_dev *hdev) /* Save the DDC file name for later use to apply once the firmware * downloading is done. */ - switch (ver.hw_variant) { - case 0x0b: /* SfP */ - case 0x0c: /* WsP */ - snprintf(fwname, sizeof(fwname), "intel/ibt-%u-%u.ddc", - le16_to_cpu(ver.hw_variant), - le16_to_cpu(params.dev_revid)); - break; - case 0x11: /* JfP */ - case 0x12: /* ThP */ - case 0x13: /* HrP */ - case 0x14: /* QnJ, IcP */ - snprintf(fwname, sizeof(fwname), "intel/ibt-%u-%u-%u.ddc", - le16_to_cpu(ver.hw_variant), - le16_to_cpu(ver.hw_revision), - le16_to_cpu(ver.fw_revision)); - break; - default: + err = btusb_setup_intel_new_get_fw_name(&ver, ¶ms, fwname, + sizeof(fwname), "ddc"); + if (!err) { bt_dev_err(hdev, "Unsupported Intel firmware naming"); return -EINVAL; }
From: Takashi Sakamoto o-takashi@sakamocchi.jp
[ Upstream commit 644b2e97405b0b74845e1d3c2b4fe4c34858062b ]
This commit fixes hard-coded model-id for an unit of Apogee Ensemble with a correct value. This unit uses DM1500 ASIC produced ArchWave AG (formerly known as BridgeCo AG).
I note that this model supports three modes in the number of data channels in tx/rx streams; 8 ch pairs, 10 ch pairs, 18 ch pairs. The mode is switched by Vendor-dependent AV/C command, like:
$ cd linux-firewire-utils $ ./firewire-request /dev/fw1 fcp 0x00ff000003dbeb0600000000 (8ch pairs) $ ./firewire-request /dev/fw1 fcp 0x00ff000003dbeb0601000000 (10ch pairs) $ ./firewire-request /dev/fw1 fcp 0x00ff000003dbeb0602000000 (18ch pairs)
When switching between different mode, the unit disappears from IEEE 1394 bus, then appears on the bus with different combination of stream formats. In a mode of 18 ch pairs, available sampling rate is up to 96.0 kHz, else up to 192.0 kHz.
$ ./hinawa-config-rom-printer /dev/fw1 { 'bus-info': { 'adj': False, 'bmc': True, 'chip_ID': 21474898341, 'cmc': True, 'cyc_clk_acc': 100, 'generation': 2, 'imc': True, 'isc': True, 'link_spd': 2, 'max_ROM': 1, 'max_rec': 512, 'name': '1394', 'node_vendor_ID': 987, 'pmc': False}, 'root-directory': [ ['HARDWARE_VERSION', 19], [ 'NODE_CAPABILITIES', { 'addressing': {'64': True, 'fix': True, 'prv': False}, 'misc': {'int': False, 'ms': False, 'spt': True}, 'state': { 'atn': False, 'ded': False, 'drq': True, 'elo': False, 'init': False, 'lst': True, 'off': False}, 'testing': {'bas': False, 'ext': False}}], ['VENDOR', 987], ['DESCRIPTOR', 'Apogee Electronics'], ['MODEL', 126702], ['DESCRIPTOR', 'Ensemble'], ['VERSION', 5297], [ 'UNIT', [ ['SPECIFIER_ID', 41005], ['VERSION', 65537], ['MODEL', 126702], ['DESCRIPTOR', 'Ensemble']]], [ 'DEPENDENT_INFO', [ ['SPECIFIER_ID', 2037], ['VERSION', 1], [(58, 'IMMEDIATE'), 16777159], [(59, 'IMMEDIATE'), 1048576], [(60, 'IMMEDIATE'), 16777159], [(61, 'IMMEDIATE'), 6291456]]]]}
Signed-off-by: Takashi Sakamoto o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai tiwai@suse.de Signed-off-by: Sasha Levin sashal@kernel.org --- sound/firewire/bebob/bebob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sound/firewire/bebob/bebob.c b/sound/firewire/bebob/bebob.c index 93676354f87f..de4af8a41ff0 100644 --- a/sound/firewire/bebob/bebob.c +++ b/sound/firewire/bebob/bebob.c @@ -434,7 +434,7 @@ static const struct ieee1394_device_id bebob_id_table[] = { /* Apogee Electronics, DA/AD/DD-16X (X-FireWire card) */ SND_BEBOB_DEV_ENTRY(VEN_APOGEE, 0x00010048, &spec_normal), /* Apogee Electronics, Ensemble */ - SND_BEBOB_DEV_ENTRY(VEN_APOGEE, 0x00001eee, &spec_normal), + SND_BEBOB_DEV_ENTRY(VEN_APOGEE, 0x01eeee, &spec_normal), /* ESI, Quatafire610 */ SND_BEBOB_DEV_ENTRY(VEN_ESI, 0x00010064, &spec_normal), /* AcousticReality, eARMasterOne */
From: Daniel Vetter daniel.vetter@ffwll.ch
[ Upstream commit 4f4b374332ec0ae9c738ff8ec9bed5cd97ff9adc ]
This is the much more correct fix for my earlier attempt at:
https://lkml.org/lkml/2018/12/10/118
Short recap:
- There's not actually a locking issue, it's just lockdep being a bit too eager to complain about a possible deadlock.
- Contrary to what I claimed the real problem is recursion on kn->count. Greg pointed me at sysfs_break_active_protection(), used by the scsi subsystem to allow a sysfs file to unbind itself. That would be a real deadlock, which isn't what's happening here. Also, breaking the active protection means we'd need to manually handle all the lifetime fun.
- With Rafael we discussed the task_work approach, which kinda works, but has two downsides: It's a functional change for a lockdep annotation issue, and it won't work for the bind file (which needs to get the errno from the driver load function back to userspace).
- Greg also asked why this never showed up: To hit this you need to unregister a 2nd driver from the unload code of your first driver. I guess only gpus do that. The bug has always been there, but only with a recent patch series did we add more locks so that lockdep built a chain from unbinding the snd-hda driver to the acpi_video_unregister call.
Full lockdep splat:
[12301.898799] ============================================ [12301.898805] WARNING: possible recursive locking detected [12301.898811] 4.20.0-rc7+ #84 Not tainted [12301.898815] -------------------------------------------- [12301.898821] bash/5297 is trying to acquire lock: [12301.898826] 00000000f61c6093 (kn->count#39){++++}, at: kernfs_remove_by_name_ns+0x3b/0x80 [12301.898841] but task is already holding lock: [12301.898847] 000000005f634021 (kn->count#39){++++}, at: kernfs_fop_write+0xdc/0x190 [12301.898856] other info that might help us debug this: [12301.898862] Possible unsafe locking scenario: [12301.898867] CPU0 [12301.898870] ---- [12301.898874] lock(kn->count#39); [12301.898879] lock(kn->count#39); [12301.898883] *** DEADLOCK *** [12301.898891] May be due to missing lock nesting notation [12301.898899] 5 locks held by bash/5297: [12301.898903] #0: 00000000cd800e54 (sb_writers#4){.+.+}, at: vfs_write+0x17f/0x1b0 [12301.898915] #1: 000000000465e7c2 (&of->mutex){+.+.}, at: kernfs_fop_write+0xd3/0x190 [12301.898925] #2: 000000005f634021 (kn->count#39){++++}, at: kernfs_fop_write+0xdc/0x190 [12301.898936] #3: 00000000414ef7ac (&dev->mutex){....}, at: device_release_driver_internal+0x34/0x240 [12301.898950] #4: 000000003218fbdf (register_count_mutex){+.+.}, at: acpi_video_unregister+0xe/0x40 [12301.898960] stack backtrace: [12301.898968] CPU: 1 PID: 5297 Comm: bash Not tainted 4.20.0-rc7+ #84 [12301.898974] Hardware name: Hewlett-Packard HP EliteBook 8460p/161C, BIOS 68SCF Ver. F.01 03/11/2011 [12301.898982] Call Trace: [12301.898989] dump_stack+0x67/0x9b [12301.898997] __lock_acquire+0x6ad/0x1410 [12301.899003] ? kernfs_remove_by_name_ns+0x3b/0x80 [12301.899010] ? find_held_lock+0x2d/0x90 [12301.899017] ? mutex_spin_on_owner+0xe4/0x150 [12301.899023] ? find_held_lock+0x2d/0x90 [12301.899030] ? lock_acquire+0x90/0x180 [12301.899036] lock_acquire+0x90/0x180 [12301.899042] ? kernfs_remove_by_name_ns+0x3b/0x80 [12301.899049] __kernfs_remove+0x296/0x310 [12301.899055] ? kernfs_remove_by_name_ns+0x3b/0x80 [12301.899060] ? kernfs_name_hash+0xd/0x80 [12301.899066] ? kernfs_find_ns+0x6c/0x100 [12301.899073] kernfs_remove_by_name_ns+0x3b/0x80 [12301.899080] bus_remove_driver+0x92/0xa0 [12301.899085] acpi_video_unregister+0x24/0x40 [12301.899127] i915_driver_unload+0x42/0x130 [i915] [12301.899160] i915_pci_remove+0x19/0x30 [i915] [12301.899169] pci_device_remove+0x36/0xb0 [12301.899176] device_release_driver_internal+0x185/0x240 [12301.899183] unbind_store+0xaf/0x180 [12301.899189] kernfs_fop_write+0x104/0x190 [12301.899195] __vfs_write+0x31/0x180 [12301.899203] ? rcu_read_lock_sched_held+0x6f/0x80 [12301.899209] ? rcu_sync_lockdep_assert+0x29/0x50 [12301.899216] ? __sb_start_write+0x13c/0x1a0 [12301.899221] ? vfs_write+0x17f/0x1b0 [12301.899227] vfs_write+0xb9/0x1b0 [12301.899233] ksys_write+0x50/0xc0 [12301.899239] do_syscall_64+0x4b/0x180 [12301.899247] entry_SYSCALL_64_after_hwframe+0x49/0xbe [12301.899253] RIP: 0033:0x7f452ac7f7a4 [12301.899259] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 80 00 00 00 00 8b 05 aa f0 2c 00 48 63 ff 85 c0 75 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 f3 c3 66 90 55 53 48 89 d5 48 89 f3 48 83 [12301.899273] RSP: 002b:00007ffceafa6918 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [12301.899282] RAX: ffffffffffffffda RBX: 000000000000000d RCX: 00007f452ac7f7a4 [12301.899288] RDX: 000000000000000d RSI: 00005612a1abf7c0 RDI: 0000000000000001 [12301.899295] RBP: 00005612a1abf7c0 R08: 000000000000000a R09: 00005612a1c46730 [12301.899301] R10: 000000000000000a R11: 0000000000000246 R12: 000000000000000d [12301.899308] R13: 0000000000000001 R14: 00007f452af4a740 R15: 000000000000000d
Looking around I've noticed that usb and i2c already handle similar recursion problems, where a sysfs file can unbind the same type of sysfs somewhere else in the hierarchy. Relevant commits are:
commit 356c05d58af05d582e634b54b40050c73609617b Author: Alan Stern stern@rowland.harvard.edu Date: Mon May 14 13:30:03 2012 -0400
sysfs: get rid of some lockdep false positives
commit e9b526fe704812364bca07edd15eadeba163ebfb Author: Alexander Sverdlin alexander.sverdlin@nsn.com Date: Fri May 17 14:56:35 2013 +0200
i2c: suppress lockdep warning on delete_device
Implement the same trick for driver bind/unbind.
v2: Put the macro into bus.c (Greg).
Reviewed-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Cc: Ramalingam C ramalingam.c@intel.com Cc: Arend van Spriel aspriel@gmail.com Cc: Andy Shevchenko andriy.shevchenko@linux.intel.com Cc: Geert Uytterhoeven geert+renesas@glider.be Cc: Bartosz Golaszewski brgl@bgdev.pl Cc: Heikki Krogerus heikki.krogerus@linux.intel.com Cc: Vivek Gautam vivek.gautam@codeaurora.org Cc: Joe Perches joe@perches.com Signed-off-by: Daniel Vetter daniel.vetter@intel.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/base/bus.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 8bfd27ec73d6..585e2e1c9c8f 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -31,6 +31,9 @@ static struct kset *system_kset;
#define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr)
+#define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ + struct driver_attribute driver_attr_##_name = \ + __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)
static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); @@ -195,7 +198,7 @@ static ssize_t unbind_store(struct device_driver *drv, const char *buf, bus_put(bus); return err; } -static DRIVER_ATTR_WO(unbind); +static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, S_IWUSR, NULL, unbind_store);
/* * Manually attach a device to a driver. @@ -231,7 +234,7 @@ static ssize_t bind_store(struct device_driver *drv, const char *buf, bus_put(bus); return err; } -static DRIVER_ATTR_WO(bind); +static DRIVER_ATTR_IGNORE_LOCKDEP(bind, S_IWUSR, NULL, bind_store);
static ssize_t show_drivers_autoprobe(struct bus_type *bus, char *buf) {
From: Parvi Kaustubhi pkaustub@cisco.com
[ Upstream commit 8036e90f92aae2784b855a0007ae2d8154d28b3c ]
Acquiring the rtnl lock while holding usdev_lock could result in a deadlock.
For example:
usnic_ib_query_port() | mutex_lock(&us_ibdev->usdev_lock) | ib_get_eth_speed() | rtnl_lock()
rtnl_lock() | usnic_ib_netdevice_event() | mutex_lock(&us_ibdev->usdev_lock)
This commit moves the usdev_lock acquisition after the rtnl lock has been released.
This is safe to do because usdev_lock is not protecting anything being accessed in ib_get_eth_speed(). Hence, the correct order of holding locks (rtnl -> usdev_lock) is not violated.
Signed-off-by: Parvi Kaustubhi pkaustub@cisco.com Signed-off-by: Jason Gunthorpe jgg@mellanox.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 9973ac893635..3db232429630 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -334,13 +334,16 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
usnic_dbg("\n");
- mutex_lock(&us_ibdev->usdev_lock); if (ib_get_eth_speed(ibdev, port, &props->active_speed, - &props->active_width)) { - mutex_unlock(&us_ibdev->usdev_lock); + &props->active_width)) return -EINVAL; - }
+ /* + * usdev_lock is acquired after (and not before) ib_get_eth_speed call + * because acquiring rtnl_lock in ib_get_eth_speed, while holding + * usdev_lock could lead to a deadlock. + */ + mutex_lock(&us_ibdev->usdev_lock); /* props being zeroed by the caller, avoid zeroing it here */
props->lid = 0;
From: Stephan Günther moepi@moepi.net
[ Upstream commit 23c3828aa2f84edec7020c7397a22931e7a879e1 ]
With commit 09c2f95ad404 ("scsi: mpt3sas: Swap I/O memory read value back to cpu endianness"), 64bit writes in _base_writeq() were rewritten to use __raw_writeq() instad of writeq().
This introduced a bug apparent on powerpc64 systems such as the Raptor Talos II that causes the HBA to drop from the PCIe bus under heavy load and being reinitialized after a couple of seconds.
It can easily be triggered on affacted systems by using something like
fio --name=random-write --iodepth=4 --rw=randwrite --bs=4k --direct=0 \ --size=128M --numjobs=64 --end_fsync=1 fio --name=random-write --iodepth=4 --rw=randwrite --bs=64k --direct=0 \ --size=128M --numjobs=64 --end_fsync=1
a couple of times. In my case I tested it on both a ZFS raidz2 and a btrfs raid6 using LSI 9300-8i and 9400-8i controllers.
The fix consists in resembling the write ordering of writeq() by adding a mandatory write memory barrier before device access and a compiler barrier afterwards. The additional MMIO barrier is superfluous.
Signed-off-by: Stephan Günther moepi@moepi.net Reported-by: Matt Corallo linux@bluematt.me Acked-by: Sreekanth Reddy Sreekanth.Reddy@broadcom.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/scsi/mpt3sas/mpt3sas_base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 59d7844ee022..b59bba3e6516 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -3344,8 +3344,9 @@ _base_mpi_ep_writeq(__u64 b, volatile void __iomem *addr, static inline void _base_writeq(__u64 b, volatile void __iomem *addr, spinlock_t *writeq_lock) { + wmb(); __raw_writeq(b, addr); - mmiowb(); + barrier(); } #else static inline void
From: Kevin Barnett kevin.barnett@microsemi.com
[ Upstream commit 2ba55c9851d74eb015a554ef69ddf2ef061d5780 ]
Problem: The Linux kernel takes a logical volume offline after a LUN reset. This is generally accompanied by this message in the dmesg output:
Device offlined - not ready after error recovery
Root Cause: The root cause is a "quirk" in the timeout handling in the Linux SCSI layer. The Linux kernel places a 30-second timeout on most media access commands (reads and writes) that it send to device drivers. When a media access command times out, the Linux kernel goes into error recovery mode for the LUN that was the target of the command that timed out. Every command that timed out is kept on a list inside of the Linux kernel to be retried later. The kernel attempts to recover the command(s) that timed out by issuing a LUN reset followed by a TEST UNIT READY. If the LUN reset and TEST UNIT READY commands are successful, the kernel retries the command(s) that timed out.
Each SCSI command issued by the kernel has a result field associated with it. This field indicates the final result of the command (success or error). When a command times out, the kernel places a value in this result field indicating that the command timed out.
The "quirk" is that after the LUN reset and TEST UNIT READY commands are completed, the kernel checks each command on the timed-out command list before retrying it. If the result field is still "timed out", the kernel treats that command as not having been successfully recovered for a retry. If the number of commands that are in this state are greater than two, the kernel takes the LUN offline.
Fix: When our RAIDStack receives a LUN reset, it simply waits until all outstanding commands complete. Generally, all of these outstanding commands complete successfully. Therefore, the fix in the smartpqi driver is to always set the command result field to indicate success when a request completes successfully. This normally isn’t necessary because the result field is always initialized to success when the command is submitted to the driver. So when the command completes successfully, the result field is left untouched. But in this case, the kernel changes the result field behind the driver’s back and then expects the field to be changed by the driver as the commands that timed-out complete.
Reviewed-by: Dave Carroll david.carroll@microsemi.com Reviewed-by: Scott Teel scott.teel@microsemi.com Signed-off-by: Kevin Barnett kevin.barnett@microsemi.com Signed-off-by: Don Brace don.brace@microsemi.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/scsi/smartpqi/smartpqi_init.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 2112ea6723c6..c62df3eefd5a 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -2720,6 +2720,9 @@ static unsigned int pqi_process_io_intr(struct pqi_ctrl_info *ctrl_info, switch (response->header.iu_type) { case PQI_RESPONSE_IU_RAID_PATH_IO_SUCCESS: case PQI_RESPONSE_IU_AIO_PATH_IO_SUCCESS: + if (io_request->scmd) + io_request->scmd->result = 0; + /* fall through */ case PQI_RESPONSE_IU_GENERAL_MANAGEMENT: break; case PQI_RESPONSE_IU_TASK_MANAGEMENT:
From: Zhi Chen zhichen@codeaurora.org
[ Upstream commit 2d3b55853b123c177037cf534c5aaa2650310094 ]
There was a race condition in SMP that an ath10k_peer was created but its member sta was null. Following are procedures of ath10k_peer creation and member sta access in peer statistics path.
1. Peer creation: ath10k_peer_create() =>ath10k_wmi_peer_create() =>ath10k_wait_for_peer_created() ...
# another kernel path, RX from firmware ath10k_htt_t2h_msg_handler() =>ath10k_peer_map_event() =>wake_up() # ar->peer_map[id] = peer //add peer to map
#wake up original path from waiting ... # peer->sta = sta //sta assignment
2. RX path of statistics ath10k_htt_t2h_msg_handler() =>ath10k_update_per_peer_tx_stats() =>ath10k_htt_fetch_peer_stats() # peer->sta //sta accessing
Any access of peer->sta after peer was added to peer_map but before sta was assigned could cause a null pointer issue. And because these two steps are asynchronous, no proper lock can protect them. So both peer and sta need to be checked before access.
Tested: QCA9984 with firmware ver 10.4-3.9.0.1-00005 Signed-off-by: Zhi Chen zhichen@codeaurora.org Signed-off-by: Kalle Valo kvalo@codeaurora.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/net/wireless/ath/ath10k/debugfs_sta.c | 2 +- drivers/net/wireless/ath/ath10k/htt_rx.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/wireless/ath/ath10k/debugfs_sta.c b/drivers/net/wireless/ath/ath10k/debugfs_sta.c index a63c97e2c50c..6f10331e986b 100644 --- a/drivers/net/wireless/ath/ath10k/debugfs_sta.c +++ b/drivers/net/wireless/ath/ath10k/debugfs_sta.c @@ -71,7 +71,7 @@ void ath10k_sta_update_rx_tid_stats_ampdu(struct ath10k *ar, u16 peer_id, u8 tid spin_lock_bh(&ar->data_lock);
peer = ath10k_peer_find_by_id(ar, peer_id); - if (!peer) + if (!peer || !peer->sta) goto out;
arsta = (struct ath10k_sta *)peer->sta->drv_priv; diff --git a/drivers/net/wireless/ath/ath10k/htt_rx.c b/drivers/net/wireless/ath/ath10k/htt_rx.c index 4d1cd90d6d27..03d4cc6f35bc 100644 --- a/drivers/net/wireless/ath/ath10k/htt_rx.c +++ b/drivers/net/wireless/ath/ath10k/htt_rx.c @@ -2589,7 +2589,7 @@ static void ath10k_htt_fetch_peer_stats(struct ath10k *ar, rcu_read_lock(); spin_lock_bh(&ar->data_lock); peer = ath10k_peer_find_by_id(ar, peer_id); - if (!peer) { + if (!peer || !peer->sta) { ath10k_warn(ar, "Invalid peer id %d peer stats buffer\n", peer_id); goto out; @@ -2642,7 +2642,7 @@ static void ath10k_fetch_10_2_tx_stats(struct ath10k *ar, u8 *data) rcu_read_lock(); spin_lock_bh(&ar->data_lock); peer = ath10k_peer_find_by_id(ar, peer_id); - if (!peer) { + if (!peer || !peer->sta) { ath10k_warn(ar, "Invalid peer id %d in peer stats buffer\n", peer_id); goto out;
From: Yanjiang Jin yanjiang.jin@hxt-semitech.com
[ Upstream commit e57b2945aa654e48f85a41e8917793c64ecb9de8 ]
We must free all irqs during shutdown, else kexec's 2nd kernel would hang in pqi_wait_for_completion_io() as below:
Call trace:
pqi_wait_for_completion_io pqi_submit_raid_request_synchronous.constprop.78+0x23c/0x310 [smartpqi] pqi_configure_events+0xec/0x1f8 [smartpqi] pqi_ctrl_init+0x814/0xca0 [smartpqi] pqi_pci_probe+0x400/0x46c [smartpqi] local_pci_probe+0x48/0xb0 pci_device_probe+0x14c/0x1b0 really_probe+0x218/0x3fc driver_probe_device+0x70/0x140 __driver_attach+0x11c/0x134 bus_for_each_dev+0x70/0xc8 driver_attach+0x30/0x38 bus_add_driver+0x1f0/0x294 driver_register+0x74/0x12c __pci_register_driver+0x64/0x70 pqi_init+0xd0/0x10000 [smartpqi] do_one_initcall+0x60/0x1d8 do_init_module+0x64/0x1f8 load_module+0x10ec/0x1350 __se_sys_finit_module+0xd4/0x100 __arm64_sys_finit_module+0x28/0x34 el0_svc_handler+0x104/0x160 el0_svc+0x8/0xc
This happens only in the following combinations:
1. smartpqi is built as module, not built-in; 2. We have a disk connected to smartpqi card; 3. Both kexec's 1st and 2nd kernels use this disk as Rootfs' mount point.
Signed-off-by: Yanjiang Jin yanjiang.jin@hxt-semitech.com Acked-by: Don Brace don.brace@microsemi.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/scsi/smartpqi/smartpqi_init.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index c62df3eefd5a..8c1a232ac6bf 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -6689,6 +6689,7 @@ static void pqi_shutdown(struct pci_dev *pci_dev) * storage. */ rc = pqi_flush_cache(ctrl_info, SHUTDOWN); + pqi_free_interrupts(ctrl_info); pqi_reset(ctrl_info); if (rc == 0) return;
From: Qian Cai cai@lca.pw
[ Upstream commit c7a082e4242fd8cd21a441071e622f87c16bdacc ]
UBSAN reported those with MegaRAID SAS-3 3108,
[ 77.467308] UBSAN: Undefined behaviour in drivers/scsi/megaraid/megaraid_sas_fp.c:117:32 [ 77.475402] index 255 is out of range for type 'MR_LD_SPAN_MAP [1]' [ 77.481677] CPU: 16 PID: 333 Comm: kworker/16:1 Not tainted 4.20.0-rc5+ #1 [ 77.488556] Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.50 06/01/2018 [ 77.495791] Workqueue: events work_for_cpu_fn [ 77.500154] Call trace: [ 77.502610] dump_backtrace+0x0/0x2c8 [ 77.506279] show_stack+0x24/0x30 [ 77.509604] dump_stack+0x118/0x19c [ 77.513098] ubsan_epilogue+0x14/0x60 [ 77.516765] __ubsan_handle_out_of_bounds+0xfc/0x13c [ 77.521767] mr_update_load_balance_params+0x150/0x158 [megaraid_sas] [ 77.528230] MR_ValidateMapInfo+0x2cc/0x10d0 [megaraid_sas] [ 77.533825] megasas_get_map_info+0x244/0x2f0 [megaraid_sas] [ 77.539505] megasas_init_adapter_fusion+0x9b0/0xf48 [megaraid_sas] [ 77.545794] megasas_init_fw+0x1ab4/0x3518 [megaraid_sas] [ 77.551212] megasas_probe_one+0x2c4/0xbe0 [megaraid_sas] [ 77.556614] local_pci_probe+0x7c/0xf0 [ 77.560365] work_for_cpu_fn+0x34/0x50 [ 77.564118] process_one_work+0x61c/0xf08 [ 77.568129] worker_thread+0x534/0xa70 [ 77.571882] kthread+0x1c8/0x1d0 [ 77.575114] ret_from_fork+0x10/0x1c
[ 89.240332] UBSAN: Undefined behaviour in drivers/scsi/megaraid/megaraid_sas_fp.c:117:32 [ 89.248426] index 255 is out of range for type 'MR_LD_SPAN_MAP [1]' [ 89.254700] CPU: 16 PID: 95 Comm: kworker/u130:0 Not tainted 4.20.0-rc5+ #1 [ 89.261665] Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.50 06/01/2018 [ 89.268903] Workqueue: events_unbound async_run_entry_fn [ 89.274222] Call trace: [ 89.276680] dump_backtrace+0x0/0x2c8 [ 89.280348] show_stack+0x24/0x30 [ 89.283671] dump_stack+0x118/0x19c [ 89.287167] ubsan_epilogue+0x14/0x60 [ 89.290835] __ubsan_handle_out_of_bounds+0xfc/0x13c [ 89.295828] MR_LdRaidGet+0x50/0x58 [megaraid_sas] [ 89.300638] megasas_build_io_fusion+0xbb8/0xd90 [megaraid_sas] [ 89.306576] megasas_build_and_issue_cmd_fusion+0x138/0x460 [megaraid_sas] [ 89.313468] megasas_queue_command+0x398/0x3d0 [megaraid_sas] [ 89.319222] scsi_dispatch_cmd+0x1dc/0x8a8 [ 89.323321] scsi_request_fn+0x8e8/0xdd0 [ 89.327249] __blk_run_queue+0xc4/0x158 [ 89.331090] blk_execute_rq_nowait+0xf4/0x158 [ 89.335449] blk_execute_rq+0xdc/0x158 [ 89.339202] __scsi_execute+0x130/0x258 [ 89.343041] scsi_probe_and_add_lun+0x2fc/0x1488 [ 89.347661] __scsi_scan_target+0x1cc/0x8c8 [ 89.351848] scsi_scan_channel.part.3+0x8c/0xc0 [ 89.356382] scsi_scan_host_selected+0x130/0x1f0 [ 89.361002] do_scsi_scan_host+0xd8/0xf0 [ 89.364927] do_scan_async+0x9c/0x320 [ 89.368594] async_run_entry_fn+0x138/0x420 [ 89.372780] process_one_work+0x61c/0xf08 [ 89.376793] worker_thread+0x13c/0xa70 [ 89.380546] kthread+0x1c8/0x1d0 [ 89.383778] ret_from_fork+0x10/0x1c
This is because when populating Driver Map using firmware raid map, all non-existing VDs set their ldTgtIdToLd to 0xff, so it can be skipped later.
From drivers/scsi/megaraid/megaraid_sas_base.c ,
memset(instance->ld_ids, 0xff, MEGASAS_MAX_LD_IDS);
From drivers/scsi/megaraid/megaraid_sas_fp.c ,
/* For non existing VDs, iterate to next VD*/ if (ld >= (MAX_LOGICAL_DRIVES_EXT - 1)) continue;
However, there are a few places that failed to skip those non-existing VDs due to off-by-one errors. Then, those 0xff leaked into MR_LdRaidGet(0xff, map) and triggered the out-of-bound accesses.
Fixes: 51087a8617fe ("megaraid_sas : Extended VD support") Signed-off-by: Qian Cai cai@lca.pw Acked-by: Sumit Saxena sumit.saxena@broadcom.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/scsi/megaraid/megaraid_sas_fp.c | 2 +- drivers/scsi/megaraid/megaraid_sas_fusion.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/megaraid/megaraid_sas_fp.c b/drivers/scsi/megaraid/megaraid_sas_fp.c index 59ecbb3b53b5..a33628550425 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fp.c +++ b/drivers/scsi/megaraid/megaraid_sas_fp.c @@ -1266,7 +1266,7 @@ void mr_update_load_balance_params(struct MR_DRV_RAID_MAP_ALL *drv_map,
for (ldCount = 0; ldCount < MAX_LOGICAL_DRIVES_EXT; ldCount++) { ld = MR_TargetIdToLdGet(ldCount, drv_map); - if (ld >= MAX_LOGICAL_DRIVES_EXT) { + if (ld >= MAX_LOGICAL_DRIVES_EXT - 1) { lbInfo[ldCount].loadBalanceFlag = 0; continue; } diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c index c7f95bace353..f45c54f02bfa 100644 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c @@ -2832,7 +2832,7 @@ static void megasas_build_ld_nonrw_fusion(struct megasas_instance *instance, device_id < instance->fw_supported_vd_count)) {
ld = MR_TargetIdToLdGet(device_id, local_map_ptr); - if (ld >= instance->fw_supported_vd_count) + if (ld >= instance->fw_supported_vd_count - 1) fp_possible = 0; else { raid = MR_LdRaidGet(ld, local_map_ptr);
From: Eric Sandeen sandeen@redhat.com
[ Upstream commit 3cc31fa65d85610574c0f6a474e89f4c419923d5 ]
iomap_is_partially_uptodate() is intended to check wither blocks within the selected range of a not-uptodate page are uptodate; if the range we care about is up to date, it's an optimization.
However, the iomap implementation continues to check all blocks up to from+count, which is beyond the page, and can even be well beyond the iop->uptodate bitmap.
I think the worst that will happen is that we may eventually find a zero bit and return "not partially uptodate" when it would have otherwise returned true, and skip the optimization. Still, it's clearly an invalid memory access that must be fixed.
So: fix this by limiting the search to within the page as is done in the non-iomap variant, block_is_partially_uptodate().
Zorro noticed thiswhen KASAN went off for 512 byte blocks on a 64k page system:
BUG: KASAN: slab-out-of-bounds in iomap_is_partially_uptodate+0x1a0/0x1e0 Read of size 8 at addr ffff800120c3a318 by task fsstress/22337
Reported-by: Zorro Lang zlang@redhat.com Signed-off-by: Eric Sandeen sandeen@redhat.com Signed-off-by: Eric Sandeen sandeen@sandeen.net Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/iomap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..e57fb1e534c5 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -488,16 +488,29 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages, } EXPORT_SYMBOL_GPL(iomap_readpages);
+/* + * iomap_is_partially_uptodate checks whether blocks within a page are + * uptodate or not. + * + * Returns true if all blocks which correspond to a file portion + * we want to read within the page are uptodate. + */ int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count) { struct iomap_page *iop = to_iomap_page(page); struct inode *inode = page->mapping->host; - unsigned first = from >> inode->i_blkbits; - unsigned last = (from + count - 1) >> inode->i_blkbits; + unsigned len, first, last; unsigned i;
+ /* Limit range to one page */ + len = min_t(unsigned, PAGE_SIZE - from, count); + + /* First and last blocks in range within page */ + first = from >> inode->i_blkbits; + last = (from + len - 1) >> inode->i_blkbits; + if (iop) { for (i = first; i <= last; i++) if (!test_bit(i, iop->uptodate))
From: Junxiao Bi junxiao.bi@oracle.com
[ Upstream commit 532e1e54c8140188e192348c790317921cb2dc1c ]
mount.ocfs2 ignore the inconsistent error that journal is clean but local alloc is unrecovered. After mount, local alloc not empty, then reserver cluster didn't alloc a new local alloc window, reserveration map is empty(ocfs2_reservation_map.m_bitmap_len = 0), that triggered the following panic.
This issue was reported at
https://oss.oracle.com/pipermail/ocfs2-devel/2015-May/010854.html
and was advised to fixed during mount. But this is a very unusual inconsistent state, usually journal dirty flag should be cleared at the last stage of umount until every other things go right. We may need do further debug to check that. Any way to avoid possible futher corruption, mount should be abort and fsck should be run.
(mount.ocfs2,1765,1):ocfs2_load_local_alloc:353 ERROR: Local alloc hasn't been recovered! found = 6518, set = 6518, taken = 8192, off = 15912372 ocfs2: Mounting device (202,64) on (node 0, slot 3) with ordered data mode. o2dlm: Joining domain 89CEAC63CC4F4D03AC185B44E0EE0F3F ( 0 1 2 3 4 5 6 8 ) 8 nodes ocfs2: Mounting device (202,80) on (node 0, slot 3) with ordered data mode. o2hb: Region 89CEAC63CC4F4D03AC185B44E0EE0F3F (xvdf) is now a quorum device o2net: Accepted connection from node yvwsoa17p (num 7) at 172.22.77.88:7777 o2dlm: Node 7 joins domain 64FE421C8C984E6D96ED12C55FEE2435 ( 0 1 2 3 4 5 6 7 8 ) 9 nodes o2dlm: Node 7 joins domain 89CEAC63CC4F4D03AC185B44E0EE0F3F ( 0 1 2 3 4 5 6 7 8 ) 9 nodes ------------[ cut here ]------------ kernel BUG at fs/ocfs2/reservations.c:507! invalid opcode: 0000 [#1] SMP Modules linked in: ocfs2 rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs fscache lockd grace ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sunrpc ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 ovmapi ppdev parport_pc parport xen_netfront fb_sys_fops sysimgblt sysfillrect syscopyarea acpi_cpufreq pcspkr i2c_piix4 i2c_core sg ext4 jbd2 mbcache2 sr_mod cdrom xen_blkfront pata_acpi ata_generic ata_piix floppy dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 4349 Comm: startWebLogic.s Not tainted 4.1.12-124.19.2.el6uek.x86_64 #2 Hardware name: Xen HVM domU, BIOS 4.4.4OVM 09/06/2018 task: ffff8803fb04e200 ti: ffff8800ea4d8000 task.ti: ffff8800ea4d8000 RIP: 0010:[<ffffffffa05e96a8>] [<ffffffffa05e96a8>] __ocfs2_resv_find_window+0x498/0x760 [ocfs2] Call Trace: ocfs2_resmap_resv_bits+0x10d/0x400 [ocfs2] ocfs2_claim_local_alloc_bits+0xd0/0x640 [ocfs2] __ocfs2_claim_clusters+0x178/0x360 [ocfs2] ocfs2_claim_clusters+0x1f/0x30 [ocfs2] ocfs2_convert_inline_data_to_extents+0x634/0xa60 [ocfs2] ocfs2_write_begin_nolock+0x1c6/0x1da0 [ocfs2] ocfs2_write_begin+0x13e/0x230 [ocfs2] generic_perform_write+0xbf/0x1c0 __generic_file_write_iter+0x19c/0x1d0 ocfs2_file_write_iter+0x589/0x1360 [ocfs2] __vfs_write+0xb8/0x110 vfs_write+0xa9/0x1b0 SyS_write+0x46/0xb0 system_call_fastpath+0x18/0xd7 Code: ff ff 8b 75 b8 39 75 b0 8b 45 c8 89 45 98 0f 84 e5 fe ff ff 45 8b 74 24 18 41 8b 54 24 1c e9 56 fc ff ff 85 c0 0f 85 48 ff ff ff <0f> 0b 48 8b 05 cf c3 de ff 48 ba 00 00 00 00 00 00 00 10 48 85 RIP __ocfs2_resv_find_window+0x498/0x760 [ocfs2] RSP <ffff8800ea4db668> ---[ end trace 566f07529f2edf3c ]--- Kernel panic - not syncing: Fatal exception Kernel Offset: disabled
Link: http://lkml.kernel.org/r/20181121020023.3034-2-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi junxiao.bi@oracle.com Reviewed-by: Yiwen Jiang jiangyiwen@huawei.com Acked-by: Joseph Qi jiangqi903@gmail.com Cc: Jun Piao piaojun@huawei.com Cc: Mark Fasheh mfasheh@versity.com Cc: Joel Becker jlbec@evilplan.org Cc: Changwei Ge ge.changwei@h3c.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/ocfs2/localalloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7642b6712c39..30208233f65b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -345,13 +345,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) if (num_used || alloc->id1.bitmap1.i_used || alloc->id1.bitmap1.i_total - || la->la_bm_off) - mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + || la->la_bm_off) { + mlog(ML_ERROR, "inconsistent detected, clean journal with" + " unrecovered local alloc, please run fsck.ocfs2!\n" "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
+ status = -EINVAL; + goto bail; + } + osb->local_alloc_bh = alloc_bh; osb->local_alloc_state = OCFS2_LA_ENABLED;
From: Brian Foster bfoster@redhat.com
[ Upstream commit 3fa750dcf29e8606e3969d13d8e188cc1c0f511d ]
write_cache_pages() is used in both background and integrity writeback scenarios by various filesystems. Background writeback is mostly concerned with cleaning a certain number of dirty pages based on various mm heuristics. It may not write the full set of dirty pages or wait for I/O to complete. Integrity writeback is responsible for persisting a set of dirty pages before the writeback job completes. For example, an fsync() call must perform integrity writeback to ensure data is on disk before the call returns.
write_cache_pages() unconditionally breaks out of its processing loop in the event of a ->writepage() error. This is fine for background writeback, which had no strict requirements and will eventually come around again. This can cause problems for integrity writeback on filesystems that might need to clean up state associated with failed page writeouts. For example, XFS performs internal delayed allocation accounting before returning a ->writepage() error, where applicable. If the current writeback happens to be associated with an unmount and write_cache_pages() completes the writeback prematurely due to error, the filesystem is unmounted in an inconsistent state if dirty+delalloc pages still exist.
To handle this problem, update write_cache_pages() to always process the full set of pages for integrity writeback regardless of ->writepage() errors. Save the first encountered error and return it to the caller once complete. This facilitates XFS (or any other fs that expects integrity writeback to process the entire set of dirty pages) to clean up its internal state completely in the event of persistent mapping errors. Background writeback continues to exit on the first error encountered.
[akpm@linux-foundation.org: fix typo in comment] Link: http://lkml.kernel.org/r/20181116134304.32440-1-bfoster@redhat.com Signed-off-by: Brian Foster bfoster@redhat.com Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/page-writeback.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 84ae9bf5858a..ea4fd3af3b4b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2156,6 +2156,7 @@ int write_cache_pages(struct address_space *mapping, { int ret = 0; int done = 0; + int error; struct pagevec pvec; int nr_pages; pgoff_t uninitialized_var(writeback_index); @@ -2236,25 +2237,31 @@ int write_cache_pages(struct address_space *mapping, goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); - ret = (*writepage)(page, wbc, data); - if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { + error = (*writepage)(page, wbc, data); + if (unlikely(error)) { + /* + * Handle errors according to the type of + * writeback. There's no need to continue for + * background writeback. Just push done_index + * past this page so media errors won't choke + * writeout for the entire file. For integrity + * writeback, we must process the entire dirty + * set regardless of errors because the fs may + * still have state to clear for each page. In + * that case we continue processing and return + * the first error. + */ + if (error == AOP_WRITEPAGE_ACTIVATE) { unlock_page(page); - ret = 0; - } else { - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). - */ + error = 0; + } else if (wbc->sync_mode != WB_SYNC_ALL) { + ret = error; done_index = page->index + 1; done = 1; break; } + if (!ret) + ret = error; }
/*
From: Aaron Lu aaron.lu@intel.com
[ Upstream commit 66f71da9dd38af17dc17209cdde7987d4679a699 ]
Since a2468cc9bfdf ("swap: choose swap device according to numa node"), avail_lists field of swap_info_struct is changed to an array with MAX_NUMNODES elements. This made swap_info_struct size increased to 40KiB and needs an order-4 page to hold it.
This is not optimal in that: 1 Most systems have way less than MAX_NUMNODES(1024) nodes so it is a waste of memory; 2 It could cause swapon failure if the swap device is swapped on after system has been running for a while, due to no order-4 page is available as pointed out by Vasily Averin.
Solve the above two issues by using nr_node_ids(which is the actual possible node number the running system has) for avail_lists instead of MAX_NUMNODES.
nr_node_ids is unknown at compile time so can't be directly used when declaring this array. What I did here is to declare avail_lists as zero element array and allocate space for it when allocating space for swap_info_struct. The reason why keep using array but not pointer is plist_for_each_entry needs the field to be part of the struct, so pointer will not work.
This patch is on top of Vasily Averin's fix commit. I think the use of kvzalloc for swap_info_struct is still needed in case nr_node_ids is really big on some systems.
Link: http://lkml.kernel.org/r/20181115083847.GA11129@intel.com Signed-off-by: Aaron Lu aaron.lu@intel.com Reviewed-by: Andrew Morton akpm@linux-foundation.org Acked-by: Michal Hocko mhocko@suse.com Cc: Vasily Averin vvs@virtuozzo.com Cc: Huang Ying ying.huang@intel.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- include/linux/swap.h | 11 ++++++++++- mm/swapfile.c | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h index 8e2c11e692ba..77221c16733a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -232,7 +232,6 @@ struct swap_info_struct { unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ - struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ @@ -273,6 +272,16 @@ struct swap_info_struct { */ struct work_struct discard_work; /* discard worker */ struct swap_cluster_list discard_clusters; /* discard clusters list */ + struct plist_node avail_lists[0]; /* + * entries in swap_avail_heads, one + * entry per node. + * Must be last as the number of the + * array is nr_node_ids, which is not + * a fixed value so have to allocate + * dynamically. + * And it has to be an array so that + * plist_for_each_* can work. + */ };
#ifdef CONFIG_64BIT diff --git a/mm/swapfile.c b/mm/swapfile.c index 8810a6d7d67f..08ba73eaaba2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2819,8 +2819,9 @@ static struct swap_info_struct *alloc_swap_info(void) struct swap_info_struct *p; unsigned int type; int i; + int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
- p = kvzalloc(sizeof(*p), GFP_KERNEL); + p = kvzalloc(size, GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM);
From: Peter Xu peterx@redhat.com
[ Upstream commit 3cfd22be0ad663248fadfc8f6ffa3e255c394552 ]
When the process being tracked does mremap() without UFFD_FEATURE_EVENT_REMAP on the corresponding tracking uffd file handle, we should not generate the remap event, and at the same time we should clear all the uffd flags on the new VMA. Without this patch, we can still have the VM_UFFD_MISSING|VM_UFFD_WP flags on the new VMA even the fault handling process does not even know the existance of the VMA.
Link: http://lkml.kernel.org/r/20181211053409.20317-1-peterx@redhat.com Signed-off-by: Peter Xu peterx@redhat.com Reviewed-by: Andrea Arcangeli aarcange@redhat.com Acked-by: Mike Rapoport rppt@linux.vnet.ibm.com Reviewed-by: William Kucharski william.kucharski@oracle.com Cc: Andrea Arcangeli aarcange@redhat.com Cc: Mike Rapoport rppt@linux.vnet.ibm.com Cc: Kirill A. Shutemov kirill@shutemov.name Cc: Hugh Dickins hughd@google.com Cc: Pavel Emelyanov xemul@virtuozzo.com Cc: Pravin Shedge pravin.shedge4linux@gmail.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/userfaultfd.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7a85e609fc27..d8b8323e80f4 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -736,10 +736,18 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx;
ctx = vma->vm_userfaultfd_ctx.ctx; - if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + + if (!ctx) + return; + + if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); WRITE_ONCE(ctx->mmap_changing, true); + } else { + /* Drop uffd context if remap feature not enabled */ + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); } }
From: Michal Hocko mhocko@suse.com
[ Upstream commit 7550c6079846a24f30d15ac75a941c8515dbedfb ]
Patch series "THP eligibility reporting via proc".
This series of three patches aims at making THP eligibility reporting much more robust and long term sustainable. The trigger for the change is a regression report [2] and the long follow up discussion. In short the specific application didn't have good API to query whether a particular mapping can be backed by THP so it has used VMA flags to workaround that. These flags represent a deep internal state of VMAs and as such they should be used by userspace with a great deal of caution.
A similar has happened for [3] when users complained that VM_MIXEDMAP is no longer set on DAX mappings. Again a lack of a proper API led to an abuse.
The first patch in the series tries to emphasise that that the semantic of flags might change and any application consuming those should be really careful.
The remaining two patches provide a more suitable interface to address [2] and provide a consistent API to query the THP status both for each VMA and process wide as well. [1]
http://lkml.kernel.org/r/20181120103515.25280-1-mhocko@kernel.org [2] http://lkml.kernel.org/r/http://lkml.kernel.org/r/alpine.DEB.2.21.1809241054... [3] http://lkml.kernel.org/r/20181002100531.GC4135@quack2.suse.cz
This patch (of 3):
Even though vma flags exported via /proc/<pid>/smaps are explicitly documented to be not guaranteed for future compatibility the warning doesn't go far enough because it doesn't mention semantic changes to those flags. And they are important as well because these flags are a deep implementation internal to the MM code and the semantic might change at any time.
Let's consider two recent examples: http://lkml.kernel.org/r/20181002100531.GC4135@quack2.suse.cz : commit e1fb4a086495 "dax: remove VM_MIXEDMAP for fsdax and device dax" has : removed VM_MIXEDMAP flag from DAX VMAs. Now our testing shows that in the : mean time certain customer of ours started poking into /proc/<pid>/smaps : and looks at VMA flags there and if VM_MIXEDMAP is missing among the VMA : flags, the application just fails to start complaining that DAX support is : missing in the kernel.
http://lkml.kernel.org/r/alpine.DEB.2.21.1809241054050.224429@chino.kir.corp... : Commit 1860033237d4 ("mm: make PR_SET_THP_DISABLE immediately active") : introduced a regression in that userspace cannot always determine the set : of vmas where thp is ineligible. : Userspace relies on the "nh" flag being emitted as part of /proc/pid/smaps : to determine if a vma is eligible to be backed by hugepages. : Previous to this commit, prctl(PR_SET_THP_DISABLE, 1) would cause thp to : be disabled and emit "nh" as a flag for the corresponding vmas as part of : /proc/pid/smaps. After the commit, thp is disabled by means of an mm : flag and "nh" is not emitted. : This causes smaps parsing libraries to assume a vma is eligible for thp : and ends up puzzling the user on why its memory is not backed by thp.
In both cases userspace was relying on a semantic of a specific VMA flag. The primary reason why that happened is a lack of a proper interface. While this has been worked on and it will be fixed properly, it seems that our wording could see some refinement and be more vocal about semantic aspect of these flags as well.
Link: http://lkml.kernel.org/r/20181211143641.3503-2-mhocko@kernel.org Signed-off-by: Michal Hocko mhocko@suse.com Acked-by: Jan Kara jack@suse.cz Acked-by: Dan Williams dan.j.williams@intel.com Acked-by: David Rientjes rientjes@google.com Acked-by: Mike Rapoport rppt@linux.ibm.com Acked-by: Vlastimil Babka vbabka@suse.cz Cc: Dan Williams dan.j.williams@intel.com Cc: David Rientjes rientjes@google.com Cc: Paul Oppenheimer bepvte@gmail.com Cc: William Kucharski william.kucharski@oracle.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- Documentation/filesystems/proc.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22b4b00dee31..06ac6dda9b34 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -496,7 +496,9 @@ manner. The codes are the following:
Note that there is no guarantee that every flag and associated mnemonic will be present in all further kernel releases. Things get changed, the flags may -be vanished or the reverse -- new added. +be vanished or the reverse -- new added. Interpretation of their meaning +might change in future as well. So each consumer of these flags has to +follow each specific kernel version for the exact semantic.
This file is only present if the CONFIG_MMU kernel configuration option is enabled.
linux-stable-mirror@lists.linaro.org