The bug is here:
p->target_id, p->target_lun);
The list iterator 'p' will point to a bogus position containing
HEAD if the list is empty or no element is found. This case must
be checked before any use of the iterator, otherwise it will
lead to a invalid memory access.
To fix this bug, add an check. Use a new variable 'iter' as the
list iterator, while use the origin variable 'p' as a dedicated
pointer to point to the found element.
Cc: stable(a)vger.kernel.org
Fixes: ^1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong(a)gmail.com>
---
drivers/scsi/dc395x.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/dc395x.c b/drivers/scsi/dc395x.c
index c11916b8ae00..bbc03190a6f2 100644
--- a/drivers/scsi/dc395x.c
+++ b/drivers/scsi/dc395x.c
@@ -3588,10 +3588,19 @@ static struct DeviceCtlBlk *device_alloc(struct AdapterCtlBlk *acb,
#endif
if (dcb->target_lun != 0) {
/* Copy settings */
- struct DeviceCtlBlk *p;
- list_for_each_entry(p, &acb->dcb_list, list)
- if (p->target_id == dcb->target_id)
+ struct DeviceCtlBlk *p = NULL, *iter;
+
+ list_for_each_entry(iter, &acb->dcb_list, list)
+ if (iter->target_id == dcb->target_id) {
+ p = iter;
break;
+ }
+
+ if (!p) {
+ kfree(dcb);
+ return NULL;
+ }
+
dprintkdbg(DBG_1,
"device_alloc: <%02i-%i> copy from <%02i-%i>\n",
dcb->target_id, dcb->target_lun,
--
2.17.1
The three bugs are here:
__func__, s3a_buf->s3a_data->exp_id);
__func__, md_buf->metadata->exp_id);
__func__, dis_buf->dis_data->exp_id);
The list iterator 's3a_buf/md_buf/dis_buf' will point to a bogus
position containing HEAD if the list is empty or no element is found.
This case must be checked before any use of the iterator, otherwise
it will lead to a invalid memory access.
To fix this bug, add an check. Use a new variable '*_iter' as the
list iterator, while use the old variable '*_buf' as a dedicated
pointer to point to the found element.
Cc: stable(a)vger.kernel.org
Fixes: ad85094b293e4 ("Revert "media: staging: atomisp: Remove driver"")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong(a)gmail.com>
---
.../staging/media/atomisp/pci/atomisp_cmd.c | 57 ++++++++++++-------
1 file changed, 36 insertions(+), 21 deletions(-)
diff --git a/drivers/staging/media/atomisp/pci/atomisp_cmd.c b/drivers/staging/media/atomisp/pci/atomisp_cmd.c
index 97d5a528969b..0da0b69a4637 100644
--- a/drivers/staging/media/atomisp/pci/atomisp_cmd.c
+++ b/drivers/staging/media/atomisp/pci/atomisp_cmd.c
@@ -901,9 +901,9 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error,
int err;
unsigned long irqflags;
struct ia_css_frame *frame = NULL;
- struct atomisp_s3a_buf *s3a_buf = NULL, *_s3a_buf_tmp;
- struct atomisp_dis_buf *dis_buf = NULL, *_dis_buf_tmp;
- struct atomisp_metadata_buf *md_buf = NULL, *_md_buf_tmp;
+ struct atomisp_s3a_buf *s3a_buf = NULL, *_s3a_buf_tmp, *s3a_iter;
+ struct atomisp_dis_buf *dis_buf = NULL, *_dis_buf_tmp, *dis_iter;
+ struct atomisp_metadata_buf *md_buf = NULL, *_md_buf_tmp, *md_iter;
enum atomisp_metadata_type md_type;
struct atomisp_device *isp = asd->isp;
struct v4l2_control ctrl;
@@ -942,60 +942,75 @@ void atomisp_buf_done(struct atomisp_sub_device *asd, int error,
switch (buf_type) {
case IA_CSS_BUFFER_TYPE_3A_STATISTICS:
- list_for_each_entry_safe(s3a_buf, _s3a_buf_tmp,
+ list_for_each_entry_safe(s3a_iter, _s3a_buf_tmp,
&asd->s3a_stats_in_css, list) {
- if (s3a_buf->s3a_data ==
+ if (s3a_iter->s3a_data ==
buffer.css_buffer.data.stats_3a) {
- list_del_init(&s3a_buf->list);
- list_add_tail(&s3a_buf->list,
+ list_del_init(&s3a_iter->list);
+ list_add_tail(&s3a_iter->list,
&asd->s3a_stats_ready);
+ s3a_buf = s3a_iter;
break;
}
}
asd->s3a_bufs_in_css[css_pipe_id]--;
atomisp_3a_stats_ready_event(asd, buffer.css_buffer.exp_id);
- dev_dbg(isp->dev, "%s: s3a stat with exp_id %d is ready\n",
- __func__, s3a_buf->s3a_data->exp_id);
+ if (s3a_buf)
+ dev_dbg(isp->dev, "%s: s3a stat with exp_id %d is ready\n",
+ __func__, s3a_buf->s3a_data->exp_id);
+ else
+ dev_dbg(isp->dev, "%s: s3a stat is ready with no exp_id found\n",
+ __func__);
break;
case IA_CSS_BUFFER_TYPE_METADATA:
if (error)
break;
md_type = atomisp_get_metadata_type(asd, css_pipe_id);
- list_for_each_entry_safe(md_buf, _md_buf_tmp,
+ list_for_each_entry_safe(md_iter, _md_buf_tmp,
&asd->metadata_in_css[md_type], list) {
- if (md_buf->metadata ==
+ if (md_iter->metadata ==
buffer.css_buffer.data.metadata) {
- list_del_init(&md_buf->list);
- list_add_tail(&md_buf->list,
+ list_del_init(&md_iter->list);
+ list_add_tail(&md_iter->list,
&asd->metadata_ready[md_type]);
+ md_buf = md_iter;
break;
}
}
asd->metadata_bufs_in_css[stream_id][css_pipe_id]--;
atomisp_metadata_ready_event(asd, md_type);
- dev_dbg(isp->dev, "%s: metadata with exp_id %d is ready\n",
- __func__, md_buf->metadata->exp_id);
+ if (md_buf)
+ dev_dbg(isp->dev, "%s: metadata with exp_id %d is ready\n",
+ __func__, md_buf->metadata->exp_id);
+ else
+ dev_dbg(isp->dev, "%s: metadata is ready with no exp_id found\n",
+ __func__);
break;
case IA_CSS_BUFFER_TYPE_DIS_STATISTICS:
- list_for_each_entry_safe(dis_buf, _dis_buf_tmp,
+ list_for_each_entry_safe(dis_iter, _dis_buf_tmp,
&asd->dis_stats_in_css, list) {
- if (dis_buf->dis_data ==
+ if (dis_iter->dis_data ==
buffer.css_buffer.data.stats_dvs) {
spin_lock_irqsave(&asd->dis_stats_lock,
irqflags);
- list_del_init(&dis_buf->list);
- list_add(&dis_buf->list, &asd->dis_stats);
+ list_del_init(&dis_iter->list);
+ list_add(&dis_iter->list, &asd->dis_stats);
asd->params.dis_proj_data_valid = true;
spin_unlock_irqrestore(&asd->dis_stats_lock,
irqflags);
+ dis_buf = dis_iter;
break;
}
}
asd->dis_bufs_in_css--;
- dev_dbg(isp->dev, "%s: dis stat with exp_id %d is ready\n",
- __func__, dis_buf->dis_data->exp_id);
+ if (dis_buf)
+ dev_dbg(isp->dev, "%s: dis stat with exp_id %d is ready\n",
+ __func__, dis_buf->dis_data->exp_id);
+ else
+ dev_dbg(isp->dev, "%s: dis stat is ready with no exp_id found\n",
+ __func__);
break;
case IA_CSS_BUFFER_TYPE_VF_OUTPUT_FRAME:
case IA_CSS_BUFFER_TYPE_SEC_VF_OUTPUT_FRAME:
--
2.17.1
The bug is here:
if (SCB_out == scb->phys)
The list iterator 'scb' will point to a bogus position containing
HEAD if the list is empty or no element is found. This case must
be checked before any use of the iterator, otherwise it will lead
to a invalid memory access.
To fix this bug, add an check. Use a new variable 'iter' as the
list iterator, while use the old variable 'scb' as a dedicated
pointer to point to the found element.
Cc: stable(a)vger.kernel.org
Fixes: 48a3103006631 ("wd719x: Introduce Western Digital WD7193/7197/7296 PCI SCSI card driver")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong(a)gmail.com>
---
drivers/scsi/wd719x.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c
index 1a7947554581..6087ff4c05da 100644
--- a/drivers/scsi/wd719x.c
+++ b/drivers/scsi/wd719x.c
@@ -684,11 +684,15 @@ static irqreturn_t wd719x_interrupt(int irq, void *dev_id)
case WD719X_INT_SPIDERFAILED:
/* was the cmd completed a direct or SCB command? */
if (regs.bytes.OPC == WD719X_CMD_PROCESS_SCB) {
- struct wd719x_scb *scb;
- list_for_each_entry(scb, &wd->active_scbs, list)
- if (SCB_out == scb->phys)
+ struct wd719x_scb *scb = NULL, *iter;
+
+ list_for_each_entry(iter, &wd->active_scbs, list)
+ if (SCB_out == iter->phys) {
+ scb = iter;
break;
- if (SCB_out == scb->phys)
+ }
+
+ if (scb)
wd719x_interrupt_SCB(wd, regs, scb);
else
dev_err(&wd->pdev->dev, "card returned invalid SCB pointer\n");
--
2.17.1
The bug is here:
bus_flags = connector->display_info.bus_flags;
The list iterator 'connector-' will point to a bogus position containing
HEAD if the list is empty or no element is found. This case must
be checked before any use of the iterator, otherwise it will lead
to a invalid memory access.
To fix this bug, add an check. Use a new value 'iter' as the list
iterator, while use the old value 'connector' as a dedicated variable
to point to the found element.
Cc: stable(a)vger.kernel.org
Fixes: ("drm/omap: Add support for drm_panel")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong(a)gmail.com>
---
drivers/gpu/drm/omapdrm/omap_encoder.c | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/omapdrm/omap_encoder.c b/drivers/gpu/drm/omapdrm/omap_encoder.c
index 4dd05bc732da..d648ab4223b1 100644
--- a/drivers/gpu/drm/omapdrm/omap_encoder.c
+++ b/drivers/gpu/drm/omapdrm/omap_encoder.c
@@ -76,14 +76,16 @@ static void omap_encoder_mode_set(struct drm_encoder *encoder,
struct omap_encoder *omap_encoder = to_omap_encoder(encoder);
struct omap_dss_device *output = omap_encoder->output;
struct drm_device *dev = encoder->dev;
- struct drm_connector *connector;
+ struct drm_connector *connector = NULL, *iter;
struct drm_bridge *bridge;
struct videomode vm = { 0 };
u32 bus_flags;
- list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
- if (connector->encoder == encoder)
+ list_for_each_entry(iter, &dev->mode_config.connector_list, head) {
+ if (iter->encoder == encoder) {
+ connector = iter;
break;
+ }
}
drm_display_mode_to_videomode(adjusted_mode, &vm);
@@ -106,8 +108,10 @@ static void omap_encoder_mode_set(struct drm_encoder *encoder,
omap_encoder_update_videomode_flags(&vm, bus_flags);
}
- bus_flags = connector->display_info.bus_flags;
- omap_encoder_update_videomode_flags(&vm, bus_flags);
+ if (connector) {
+ bus_flags = connector->display_info.bus_flags;
+ omap_encoder_update_videomode_flags(&vm, bus_flags);
+ }
/* Set timings for all devices in the display pipeline. */
dss_mgr_set_timings(output, &vm);
--
2.17.1
The bug is here:
if (!p)
return ret;
The list iterator value 'p' will *always* be set and non-NULL by
list_for_each_entry(), so it is incorrect to assume that the iterator
value will be NULL if the list is empty or no element is found.
To fix the bug, Use a new value 'iter' as the list iterator, while use
the old value 'p' as a dedicated variable to point to the found element.
Cc: stable(a)vger.kernel.org
Fixes: dfaa973ae9605 ("KVM: PPC: Book3S HV: In H_SVM_INIT_DONE, migrate remaining normal-GFNs to secure-GFNs")
Signed-off-by: Xiaomeng Tong <xiam0nd.tong(a)gmail.com>
---
arch/powerpc/kvm/book3s_hv_uvmem.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index e414ca44839f..0cb20ee6a632 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -360,13 +360,15 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot,
struct kvm *kvm, unsigned long *gfn)
{
- struct kvmppc_uvmem_slot *p;
+ struct kvmppc_uvmem_slot *p = NULL, *iter;
bool ret = false;
unsigned long i;
- list_for_each_entry(p, &kvm->arch.uvmem_pfns, list)
- if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns)
+ list_for_each_entry(iter, &kvm->arch.uvmem_pfns, list)
+ if (*gfn >= iter->base_pfn && *gfn < iter->base_pfn + iter->nr_pfns) {
+ p = iter;
break;
+ }
if (!p)
return ret;
/*
--
2.17.1
This is the start of the stable review cycle for the 5.17.1 release.
There are 39 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sun, 27 Mar 2022 15:04:08 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.17.1-rc1…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.17.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 5.17.1-rc1
Arnd Bergmann <arnd(a)arndb.de>
nds32: fix access_ok() checks in get/put_user
Arnd Bergmann <arnd(a)arndb.de>
m68k: fix access_ok for coldfire
Bryan O'Donoghue <bryan.odonoghue(a)linaro.org>
wcn36xx: Differentiate wcn3660 from wcn3620
James Bottomley <James.Bottomley(a)HansenPartnership.com>
tpm: use try_get_ops() in tpm-space.c
Lino Sanfilippo <LinoSanfilippo(a)gmx.de>
tpm: fix reference counting for struct tpm_chip
Linus Lüssing <ll(a)simonwunderlich.de>
mac80211: fix potential double free on mesh join
Arnd Bergmann <arnd(a)arndb.de>
uaccess: fix integer overflow on access_ok()
Paul E. McKenney <paulmck(a)kernel.org>
rcu: Don't deboost before reporting expedited quiescent state
Ritesh Harjani <riteshh(a)linux.ibm.com>
jbd2: fix use-after-free of transaction_t race
Roberto Sassu <roberto.sassu(a)huawei.com>
drm/virtio: Ensure that objs is not NULL in virtio_gpu_array_put_free()
Brian Norris <briannorris(a)chromium.org>
Revert "ath: add support for special 0x0 regulatory domain"
Ismael Ferreras Morezuelas <swyterzone(a)gmail.com>
Bluetooth: btusb: Use quirk to skip HCI_FLT_CLEAR_ALL on fake CSR controllers
Ismael Ferreras Morezuelas <swyterzone(a)gmail.com>
Bluetooth: hci_sync: Add a new quirk to skip HCI_FLT_CLEAR_ALL
Larry Finger <Larry.Finger(a)lwfinger.net>
Bluetooth: btusb: Add one more Bluetooth part for the Realtek RTL8852AE
Giovanni Cabiddu <giovanni.cabiddu(a)intel.com>
crypto: qat - disable registration of algorithms
Werner Sembach <wse(a)tuxedocomputers.com>
ACPI: video: Force backlight native for Clevo NL5xRU and NL5xNU
Maximilian Luz <luzmaximilian(a)gmail.com>
ACPI: battery: Add device HID and quirk for Microsoft Surface Go 3
Mark Cilissen <mark(a)yotsuba.nl>
ACPI / x86: Work around broken XSDT on Advantech DAC-BJ01 board
Pablo Neira Ayuso <pablo(a)netfilter.org>
netfilter: nf_tables: validate registers coming from userspace.
Pablo Neira Ayuso <pablo(a)netfilter.org>
netfilter: nf_tables: initialize registers in nft_do_chain()
Stephane Graber <stgraber(a)ubuntu.com>
drivers: net: xgene: Fix regression in CRC stripping
Giacomo Guiduzzi <guiduzzi.giacomo(a)gmail.com>
ALSA: pci: fix reading of swapped values from pcmreg in AC97 codec
Jonathan Teh <jonathan.teh(a)outlook.com>
ALSA: cmipci: Restore aux vol on suspend/resume
Lars-Peter Clausen <lars(a)metafoo.de>
ALSA: usb-audio: Add mute TLV for playback volumes on RODE NT-USB
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Add stream lock during PCM reset ioctl operations
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent prealloc proc writes
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent prepare and hw_params/hw_free calls
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent read/write and buffer changes
Takashi Iwai <tiwai(a)suse.de>
ALSA: pcm: Fix races among concurrent hw_params and hw_free calls
Jason Zheng <jasonzheng2004(a)gmail.com>
ALSA: hda/realtek: Add quirk for ASUS GA402
huangwenhui <huangwenhuia(a)uniontech.com>
ALSA: hda/realtek - Fix headset mic problem for a HP machine with alc671
Tim Crawford <tcrawford(a)system76.com>
ALSA: hda/realtek: Add quirk for Clevo NP50PNJ
Tim Crawford <tcrawford(a)system76.com>
ALSA: hda/realtek: Add quirk for Clevo NP70PNJ
Reza Jahanbakhshi <reza.jahanbakhshi(a)gmail.com>
ALSA: usb-audio: add mapping for new Corsair Virtuoso SE
Takashi Iwai <tiwai(a)suse.de>
ALSA: oss: Fix PCM OSS buffer allocation overflow
Takashi Iwai <tiwai(a)suse.de>
ASoC: sti: Fix deadlock via snd_pcm_stop_xrun() call
Eric Dumazet <edumazet(a)google.com>
llc: fix netdevice reference leaks in llc_ui_bind()
Helmut Grohne <helmut(a)subdivi.de>
Bluetooth: btusb: Add another Realtek 8761BU
Tadeusz Struk <tstruk(a)gmail.com>
tpm: Fix error handling in async work
-------------
Diffstat:
Makefile | 4 +-
arch/csky/include/asm/uaccess.h | 7 +-
arch/hexagon/include/asm/uaccess.h | 18 ++---
arch/m68k/include/asm/uaccess.h | 15 ++--
arch/microblaze/include/asm/uaccess.h | 19 +----
arch/nds32/include/asm/uaccess.h | 22 ++++--
arch/x86/kernel/acpi/boot.c | 24 ++++++
drivers/acpi/battery.c | 12 +++
drivers/acpi/video_detect.c | 75 ++++++++++++++++++
drivers/bluetooth/btusb.c | 10 ++-
drivers/char/tpm/tpm-chip.c | 46 ++---------
drivers/char/tpm/tpm-dev-common.c | 8 +-
drivers/char/tpm/tpm.h | 2 +
drivers/char/tpm/tpm2-space.c | 73 +++++++++++++++++-
drivers/crypto/qat/qat_4xxx/adf_drv.c | 7 ++
drivers/crypto/qat/qat_common/qat_crypto.c | 7 ++
drivers/gpu/drm/virtio/virtgpu_gem.c | 3 +
drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 12 +--
drivers/net/wireless/ath/regd.c | 10 +--
drivers/net/wireless/ath/wcn36xx/main.c | 3 +
drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 1 +
fs/jbd2/transaction.c | 41 ++++++----
include/net/bluetooth/hci.h | 10 +++
include/sound/pcm.h | 1 +
kernel/rcu/tree_plugin.h | 8 +-
net/bluetooth/hci_sync.c | 16 ++++
net/llc/af_llc.c | 8 ++
net/mac80211/cfg.c | 3 -
net/netfilter/nf_tables_api.c | 22 ++++--
net/netfilter/nf_tables_core.c | 2 +-
sound/core/oss/pcm_oss.c | 12 ++-
sound/core/oss/pcm_plugin.c | 5 +-
sound/core/pcm.c | 2 +
sound/core/pcm_lib.c | 4 +
sound/core/pcm_memory.c | 11 ++-
sound/core/pcm_native.c | 97 +++++++++++++++---------
sound/pci/ac97/ac97_codec.c | 4 +-
sound/pci/cmipci.c | 3 +-
sound/pci/hda/patch_realtek.c | 4 +
sound/soc/sti/uniperif_player.c | 6 +-
sound/soc/sti/uniperif_reader.c | 2 +-
sound/usb/mixer_maps.c | 10 +++
sound/usb/mixer_quirks.c | 7 +-
43 files changed, 475 insertions(+), 181 deletions(-)
hallo Greg
5.17.1-rc1
compiles, boots and runs on my x86_64
(Intel i5-11400, Fedora 35)
btw I get:
iwlwifi 0000:00:14.3: Direct firmware load for
iwlwifi-QuZ-a0-hr-b0-69.ucode failed with error -2
(not a regression in the 5.17-series, but compared to 5.16.x !)
Thanks
Tested-by: Ronald Warsow <rwarsow(a)gmx.de>
Ronald
The patch titled
Subject: mm: fix race between MADV_FREE reclaim and blkdev direct IO read
has been removed from the -mm tree. Its filename was
mm-fix-race-between-madv_free-reclaim-and-blkdev-direct-io-read.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Subject: mm: fix race between MADV_FREE reclaim and blkdev direct IO read
Problem:
=======
Userspace might read the zero-page instead of actual data from a direct IO
read on a block device if the buffers have been called madvise(MADV_FREE)
on earlier (this is discussed below) due to a race between page reclaim on
MADV_FREE and blkdev direct IO read.
- Race condition:
==============
During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks
if the page is not dirty, then discards its rmap PTE(s) (vs. remap back
if the page is dirty).
However, after try_to_unmap_one() returns to shrink_page_list(), it might
keep the page _anyway_ if page_ref_freeze() fails (it expects exactly
_one_ page reference, from the isolation for page reclaim).
Well, blkdev_direct_IO() gets references for all pages, and on READ
operations it only sets them dirty _later_.
So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct
IO read from block devices, and page reclaim happens during
__blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages()
returns, but BEFORE the pages are set dirty, the situation happens.
The direct IO read eventually completes. Now, when userspace reads the
buffers, the PTE is no longer there and the page fault handler
do_anonymous_page() services that with the zero-page, NOT the data!
A synthetic reproducer is provided.
- Page faults:
===========
If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't
happen, because that faults-in all pages as writeable, so
do_anonymous_page() sets up a new page/rmap/PTE, and that is used by
direct IO. The userspace reads don't fault as the PTE is there (thus
zero-page is not used/setup).
But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE
is no longer there; the subsequent page faults can't help:
The data-read from the block device probably won't generate faults due to
DMA (no MMU) but even in the case it wouldn't use DMA, that happens on
different virtual addresses (not user-mapped addresses) because `struct
bio_vec` stores `struct page` to figure addresses out (which are different
from user-mapped addresses) for the read.
Thus userspace reads (to user-mapped addresses) still fault, then
do_anonymous_page() gets another `struct page` that would address/ map to
other memory than the `struct page` used by `struct bio_vec` for the read.
(The original `struct page` is not available, since it wasn't freed, as
page_ref_freeze() failed due to more page refs. And even if it were
available, its data cannot be trusted anymore.)
Solution:
========
One solution is to check for the expected page reference count in
try_to_unmap_one().
There should be one reference from the isolation (that is also checked in
shrink_page_list() with page_ref_freeze()) plus one or more references
from page mapping(s) (put in discard: label). Further references mean
that rmap/PTE cannot be unmapped/nuked.
(Note: there might be more than one reference from mapping due to
fork()/clone() without CLONE_VM, which use the same `struct page` for
references, until the copy-on-write page gets copied.)
So, additional page references (e.g., from direct IO read) now prevent the
rmap/PTE from being unmapped/dropped; similarly to the page is not freed
per shrink_page_list()/page_ref_freeze()).
- Races and Barriers:
==================
The new check in try_to_unmap_one() should be safe in races with
bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's
done under the PTE lock.
The fast path doesn't take the lock, but it checks if the PTE has changed
and if so, it drops the reference and leaves the page for the slow path
(which does take that lock).
The fast path requires synchronization w/ full memory barrier: it writes
the page reference count first then it reads the PTE later, while
try_to_unmap() writes PTE first then it reads page refcount.
And a second barrier is needed, as the page dirty flag should not be read
before the page reference count (as in __remove_mapping()). (This can be
a load memory barrier only; no writes are involved.)
Call stack/comments:
- try_to_unmap_one()
- page_vma_mapped_walk()
- map_pte() # see pte_offset_map_lock():
pte_offset_map()
spin_lock()
- ptep_get_and_clear() # write PTE
- smp_mb() # (new barrier) GUP fast path
- page_ref_count() # (new check) read refcount
- page_vma_mapped_walk_done() # see pte_unmap_unlock():
pte_unmap()
spin_unlock()
- bio_iov_iter_get_pages()
- __bio_iov_iter_get_pages()
- iov_iter_get_pages()
- get_user_pages_fast()
- internal_get_user_pages_fast()
# fast path
- lockless_pages_from_mm()
- gup_{pgd,p4d,pud,pmd,pte}_range()
ptep = pte_offset_map() # not _lock()
pte = ptep_get_lockless(ptep)
page = pte_page(pte)
try_grab_compound_head(page) # inc refcount
# (RMW/barrier
# on success)
if (pte_val(pte) != pte_val(*ptep)) # read PTE
put_compound_head(page) # dec refcount
# go slow path
# slow path
- __gup_longterm_unlocked()
- get_user_pages_unlocked()
- __get_user_pages_locked()
- __get_user_pages()
- follow_{page,p4d,pud,pmd}_mask()
- follow_page_pte()
ptep = pte_offset_map_lock()
pte = *ptep
page = vm_normal_page(pte)
try_grab_page(page) # inc refcount
pte_unmap_unlock()
- Huge Pages:
==========
Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE
(aka lazyfree) pages are PageAnon() && !PageSwapBacked()
(madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn())
thus should reach shrink_page_list() -> split_huge_page_to_list() before
try_to_unmap[_one](), so it deals with normal pages only.
(And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens,
which should not or be rare, the page refcount should be greater than
mapcount: the head page is referenced by tail pages. That also prevents
checking the head `page` then incorrectly call page_remove_rmap(subpage)
for a tail page, that isn't even in the shrink_page_list()'s page_list (an
effect of split huge pmd/pmvw), as it might happen today in this unlikely
scenario.)
MADV_FREE'd buffers:
===================
So, back to the "if MADV_FREE pages are used as buffers" note. The case
is arguable, and subject to multiple interpretations.
The madvise(2) manual page on the MADV_FREE advice value says:
1) 'After a successful MADV_FREE ... data will be lost when
the kernel frees the pages.'
2) 'the free operation will be canceled if the caller writes
into the page' / 'subsequent writes ... will succeed and
then [the] kernel cannot free those dirtied pages'
3) 'If there is no subsequent write, the kernel can free the
pages at any time.'
Thoughts, questions, considerations... respectively:
1) Since the kernel didn't actually free the page (page_ref_freeze()
failed), should the data not have been lost? (on userspace read.)
2) Should writes performed by the direct IO read be able to cancel
the free operation?
- Should the direct IO read be considered as 'the caller' too,
as it's been requested by 'the caller'?
- Should the bio technique to dirty pages on return to userspace
(bio_check_pages_dirty() is called/used by __blkdev_direct_IO())
be considered in another/special way here?
3) Should an upcoming write from a previously requested direct IO
read be considered as a subsequent write, so the kernel should
not free the pages? (as it's known at the time of page reclaim.)
And lastly:
Technically, the last point would seem a reasonable consideration and
balance, as the madvise(2) manual page apparently (and fairly) seem to
assume that 'writes' are memory access from the userspace process (not
explicitly considering writes from the kernel or its corner cases; again,
fairly).. plus the kernel fix implementation for the corner case of the
largely 'non-atomic write' encompassed by a direct IO read operation, is
relatively simple; and it helps.
Reproducer:
==========
@ test.c (simplified, but works)
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
int main() {
int fd, i;
char *buf;
fd = open(DEV, O_RDONLY | O_DIRECT);
buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
buf[i] = 1; // init to non-zero
madvise(buf, BUF_SIZE, MADV_FREE);
read(fd, buf, BUF_SIZE);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
printf("%p: 0x%x\n", &buf[i], buf[i]);
return 0;
}
@ block/fops.c (formerly fs/block_dev.c)
+#include <linux/swap.h>
...
... __blkdev_direct_IO[_simple](...)
{
...
+ if (!strcmp(current->comm, "good"))
+ shrink_all_memory(ULONG_MAX);
+
ret = bio_iov_iter_get_pages(...);
+
+ if (!strcmp(current->comm, "bad"))
+ shrink_all_memory(ULONG_MAX);
...
}
@ shell
# NUM_PAGES=4
# PAGE_SIZE=$(getconf PAGE_SIZE)
# yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES}
# DEV=$(losetup -f --show test.img)
# gcc -DDEV=\"$DEV\" \
-DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \
-DPAGE_SIZE=${PAGE_SIZE} \
test.c -o test
# od -tx1 $DEV
0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a
*
0040000
# mv test good
# ./good
0x7f7c10418000: 0x79
0x7f7c10419000: 0x79
0x7f7c1041a000: 0x79
0x7f7c1041b000: 0x79
# mv good bad
# ./bad
0x7fa1b8050000: 0x0
0x7fa1b8051000: 0x0
0x7fa1b8052000: 0x0
0x7fa1b8053000: 0x0
Note: the issue is consistent on v5.17-rc3, but it's intermittent with the
support of MADV_FREE on v4.5 (60%-70% error; needs swap). [wrap
do_direct_IO() in do_blockdev_direct_IO() @ fs/direct-io.c].
- v5.17-rc3:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x0
# free | grep Swap
Swap: 0 0 0
- v4.5:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
2702 0x0
1298 0x79
# swapoff -av
swapoff /swap
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
Ceph/TCMalloc:
=============
For documentation purposes, the use case driving the analysis/fix is Ceph
on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to
release unused memory to the system from the mmap'ed page heap (might be
committed back/used again; it's not munmap'ed.) - PageHeap::DecommitSpan()
-> TCMalloc_SystemRelease() -> madvise() - PageHeap::CommitSpan() ->
TCMalloc_SystemCommit() -> do nothing.
Note: TCMalloc switched back to MADV_DONTNEED a few commits after the
release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue
just 'disappeared' on Ceph on later Ubuntu releases but is still present
in the kernel, and can be hit by other use cases.
The observed issue seems to be the old Ceph bug #22464 [1], where checksum
mismatches are observed (and instrumentation with buffer dumps shows
zero-pages read from mmap'ed/MADV_FREE'd page ranges).
The issue in Ceph was reasonably deemed a kernel bug (comment #50) and
mostly worked around with a retry mechanism, but other parts of Ceph could
still hit that (rocksdb). Anyway, it's less likely to be hit again as
TCMalloc switched out of MADV_FREE by default.
(Some kernel versions/reports from the Ceph bug, and relation with
the MADV_FREE introduction/changes; TCMalloc versions not checked.)
- 4.4 good
- 4.5 (madv_free: introduction)
- 4.9 bad
- 4.10 good? maybe a swapless system
- 4.12 (madv_free: no longer free instantly on swapless systems)
- 4.13 bad
[1] https://tracker.ceph.com/issues/22464
Thanks:
======
Several people contributed to analysis/discussions/tests/reproducers in
the first stages when drilling down on ceph/tcmalloc/linux kernel:
- Dan Hill
- Dan Streetman
- Dongdong Tao
- Gavin Guo
- Gerald Yang
- Heitor Alves de Siqueira
- Ioanna Alifieraki
- Jay Vosburgh
- Matthew Ruffell
- Ponnuvel Palaniyappan
Reviews, suggestions, corrections, comments:
- Minchan Kim
- Yu Zhao
- Huang, Ying
- John Hubbard
- Christoph Hellwig
[mfo(a)canonical.com: v4]
Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.comLink: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com
Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Signed-off-by: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Dan Hill <daniel.hill(a)canonical.com>
Cc: Dan Streetman <dan.streetman(a)canonical.com>
Cc: Dongdong Tao <dongdong.tao(a)canonical.com>
Cc: Gavin Guo <gavin.guo(a)canonical.com>
Cc: Gerald Yang <gerald.yang(a)canonical.com>
Cc: Heitor Alves de Siqueira <halves(a)canonical.com>
Cc: Ioanna Alifieraki <ioanna-maria.alifieraki(a)canonical.com>
Cc: Jay Vosburgh <jay.vosburgh(a)canonical.com>
Cc: Matthew Ruffell <matthew.ruffell(a)canonical.com>
Cc: Ponnuvel Palaniyappan <ponnuvel.palaniyappan(a)canonical.com>
Cc: <stable(a)vger.kernel.org>
Cc: Christoph Hellwig <hch(a)infradead.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/rmap.c | 25 ++++++++++++++++++++++++-
1 file changed, 24 insertions(+), 1 deletion(-)
--- a/mm/rmap.c~mm-fix-race-between-madv_free-reclaim-and-blkdev-direct-io-read
+++ a/mm/rmap.c
@@ -1588,7 +1588,30 @@ static bool try_to_unmap_one(struct foli
/* MADV_FREE page check */
if (!folio_test_swapbacked(folio)) {
- if (!folio_test_dirty(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
_
Patches currently in -mm which might be from mfo(a)canonical.com are
The patch titled
Subject: ocfs2: fix crash when mount with quota enabled
has been added to the -mm tree. Its filename is
ocfs2-fix-crash-when-mount-with-quota-enabled.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/ocfs2-fix-crash-when-mount-with-q…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/ocfs2-fix-crash-when-mount-with-q…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Subject: ocfs2: fix crash when mount with quota enabled
There is a reported crash when mounting ocfs2 with quota enabled.
RIP: 0010:ocfs2_qinfo_lock_res_init+0x44/0x50 [ocfs2]
Call Trace:
<TASK>
ocfs2_local_read_info+0xb9/0x6f0 [ocfs2]
? ocfs2_local_check_quota_file+0x197/0x390 [ocfs2]
dquot_load_quota_sb+0x216/0x470
? preempt_count_add+0x68/0xa0
dquot_load_quota_inode+0x85/0x100
ocfs2_enable_quotas+0xa0/0x1c0 [ocfs2]
ocfs2_fill_super.cold+0xc8/0x1bf [ocfs2]
mount_bdev+0x185/0x1b0
? ocfs2_initialize_super.isra.0+0xf40/0xf40 [ocfs2]
legacy_get_tree+0x27/0x40
vfs_get_tree+0x25/0xb0
path_mount+0x465/0xac0
__x64_sys_mount+0x103/0x140
do_syscall_64+0x3b/0xc0
entry_SYSCALL_64_after_hwframe+0x44/0xae
</TASK>
It is caused by when initializing dqi_gqlock, the corresponding dqi_type
and dqi_sb are not properly initialized. This issue is introduced by
commit 6c85c2c72819, which wants to avoid accessing uninitialized
variables in error cases. So make global quota info properly initialized.
Link: https://lkml.kernel.org/r/20220323023644.40084-1-joseph.qi@linux.alibaba.com
Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1007141
Fixes: 6c85c2c72819 ("ocfs2: quota_local: fix possible uninitialized-variable access in ocfs2_local_read_info()")
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reported-by: Dayvison <sathlerds(a)gmail.com>
Tested-by: Valentin Vidic <vvidic(a)valentin-vidic.from.hr>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/quota_global.c | 23 ++++++++++++-----------
fs/ocfs2/quota_local.c | 2 --
2 files changed, 12 insertions(+), 13 deletions(-)
--- a/fs/ocfs2/quota_global.c~ocfs2-fix-crash-when-mount-with-quota-enabled
+++ a/fs/ocfs2/quota_global.c
@@ -337,7 +337,6 @@ void ocfs2_unlock_global_qf(struct ocfs2
/* Read information header from global quota file */
int ocfs2_global_read_info(struct super_block *sb, int type)
{
- struct inode *gqinode = NULL;
unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
GROUP_QUOTA_SYSTEM_INODE };
struct ocfs2_global_disk_dqinfo dinfo;
@@ -346,29 +345,31 @@ int ocfs2_global_read_info(struct super_
u64 pcount;
int status;
+ oinfo->dqi_gi.dqi_sb = sb;
+ oinfo->dqi_gi.dqi_type = type;
+ ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+ oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+ oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+ oinfo->dqi_gqi_bh = NULL;
+ oinfo->dqi_gqi_count = 0;
+
/* Read global header */
- gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ oinfo->dqi_gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
OCFS2_INVALID_SLOT);
- if (!gqinode) {
+ if (!oinfo->dqi_gqinode) {
mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
type);
status = -EINVAL;
goto out_err;
}
- oinfo->dqi_gi.dqi_sb = sb;
- oinfo->dqi_gi.dqi_type = type;
- oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
- oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
- oinfo->dqi_gqi_bh = NULL;
- oinfo->dqi_gqi_count = 0;
- oinfo->dqi_gqinode = gqinode;
+
status = ocfs2_lock_global_qf(oinfo, 0);
if (status < 0) {
mlog_errno(status);
goto out_err;
}
- status = ocfs2_extent_map_get_blocks(gqinode, 0, &oinfo->dqi_giblk,
+ status = ocfs2_extent_map_get_blocks(oinfo->dqi_gqinode, 0, &oinfo->dqi_giblk,
&pcount, NULL);
if (status < 0)
goto out_unlock;
--- a/fs/ocfs2/quota_local.c~ocfs2-fix-crash-when-mount-with-quota-enabled
+++ a/fs/ocfs2/quota_local.c
@@ -702,8 +702,6 @@ static int ocfs2_local_read_info(struct
info->dqi_priv = oinfo;
oinfo->dqi_type = type;
INIT_LIST_HEAD(&oinfo->dqi_chunk);
- oinfo->dqi_gqinode = NULL;
- ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
oinfo->dqi_rec = NULL;
oinfo->dqi_lqi_bh = NULL;
oinfo->dqi_libh = NULL;
_
Patches currently in -mm which might be from joseph.qi(a)linux.alibaba.com are
ocfs2-fix-crash-when-mount-with-quota-enabled.patch