The patch below does not apply to the 4.14-, 4.19-, 5.4-, 5.10-, or 5.16-stable trees.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From db212f2eb3fb7f546366777e93c8f54614d39269 Mon Sep 17 00:00:00 2001
From: Arun Easi <aeasi@marvell.com>
Date: Thu, 10 Mar 2022 01:25:54 -0800
Subject: [PATCH] scsi: qla2xxx: Fix loss of NVMe namespaces after driver
reload test
Driver registration of the localport can race when it happens at remote
port discovery time. Fix this by performing the registration under a mutex.
Link: https://lore.kernel.org/r/20220310092604.22950-4-njavali@marvell.com
Fixes: e84067d74301 ("scsi: qla2xxx: Add FC-NVMe F/W initialization and transport registration")
Cc: stable@vger.kernel.org
Reported-by: Marco Patalano <mpatalan@redhat.com>
Tested-by: Marco Patalano <mpatalan@redhat.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Arun Easi <aeasi@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
index 5723082d94d6..3bf5cbd754a7 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -782,8 +782,6 @@ int qla_nvme_register_hba(struct scsi_qla_host *vha)
ha = vha->hw;
tmpl = &qla_nvme_fc_transport;
- WARN_ON(vha->nvme_local_port);
-
if (ql2xnvme_queues < MIN_NVME_HW_QUEUES || ql2xnvme_queues > MAX_NVME_HW_QUEUES) {
ql_log(ql_log_warn, vha, 0xfffd,
"ql2xnvme_queues=%d is out of range(MIN:%d - MAX:%d). Resetting ql2xnvme_queues to:%d\n",
@@ -797,7 +795,7 @@ int qla_nvme_register_hba(struct scsi_qla_host *vha)
(uint8_t)(ha->max_qpairs ? ha->max_qpairs : 1));
ql_log(ql_log_info, vha, 0xfffb,
- "Number of NVME queues used for this port: %d\n",
+ "Number of NVME queues used for this port: %d\n",
qla_nvme_fc_transport.max_hw_queues);
pinfo.node_name = wwn_to_u64(vha->node_name);
@@ -805,13 +803,25 @@ int qla_nvme_register_hba(struct scsi_qla_host *vha)
pinfo.port_role = FC_PORT_ROLE_NVME_INITIATOR;
pinfo.port_id = vha->d_id.b24;
- ql_log(ql_log_info, vha, 0xffff,
- "register_localport: host-traddr=nn-0x%llx:pn-0x%llx on portID:%x\n",
- pinfo.node_name, pinfo.port_name, pinfo.port_id);
- qla_nvme_fc_transport.dma_boundary = vha->host->dma_boundary;
-
- ret = nvme_fc_register_localport(&pinfo, tmpl,
- get_device(&ha->pdev->dev), &vha->nvme_local_port);
+ mutex_lock(&ha->vport_lock);
+ /*
+ * Check again for nvme_local_port to see if any other thread raced
+ * with this one and finished registration.
+ */
+ if (!vha->nvme_local_port) {
+ ql_log(ql_log_info, vha, 0xffff,
+ "register_localport: host-traddr=nn-0x%llx:pn-0x%llx on portID:%x\n",
+ pinfo.node_name, pinfo.port_name, pinfo.port_id);
+ qla_nvme_fc_transport.dma_boundary = vha->host->dma_boundary;
+
+ ret = nvme_fc_register_localport(&pinfo, tmpl,
+ get_device(&ha->pdev->dev),
+ &vha->nvme_local_port);
+ mutex_unlock(&ha->vport_lock);
+ } else {
+ mutex_unlock(&ha->vport_lock);
+ return 0;
+ }
if (ret) {
ql_log(ql_log_warn, vha, 0xffff,
"register_localport failed: ret=%x\n", ret);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 58ca5999e0367d131de82a75257fbfd5aed0195d Mon Sep 17 00:00:00 2001
From: Quinn Tran <qutran@marvell.com>
Date: Thu, 10 Mar 2022 01:25:52 -0800
Subject: [PATCH] scsi: qla2xxx: Fix incorrect reporting of task management
failure
A user saw no task management error reported even though the target device
was responding with an error. The RSP_CODE field in the status IOCB is
little endian, but the driver assumed big endian and picked up erroneous
data. Convert the data back to big endian, as it appears on the wire.
Link: https://lore.kernel.org/r/20220310092604.22950-2-njavali@marvell.com
Fixes: faef62d13463 ("[SCSI] qla2xxx: Fix Task Management command asynchronous handling")
Cc: stable@vger.kernel.org
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Quinn Tran <qutran@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
index 092e4b5da65a..21b31d6359c8 100644
--- a/drivers/scsi/qla2xxx/qla_isr.c
+++ b/drivers/scsi/qla2xxx/qla_isr.c
@@ -2498,6 +2498,7 @@ qla24xx_tm_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, void *tsk)
iocb->u.tmf.data = QLA_FUNCTION_FAILED;
} else if ((le16_to_cpu(sts->scsi_status) &
SS_RESPONSE_INFO_LEN_VALID)) {
+ host_to_fcp_swap(sts->data, sizeof(sts->data));
if (le32_to_cpu(sts->rsp_data_len) < 4) {
ql_log(ql_log_warn, fcport->vha, 0x503b,
"Async-%s error - hdl=%x not enough response(%d).\n",
The patch below does not apply to the 5.10- or 5.15-stable trees.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0972252450f90db56dd5415a20e2aec21a08d036 Mon Sep 17 00:00:00 2001
From: Arun Easi <aeasi@marvell.com>
Date: Thu, 10 Mar 2022 01:25:56 -0800
Subject: [PATCH] scsi: qla2xxx: Fix crash during module load unload test
During purex packet handling the driver was incorrectly freeing a
pre-allocated structure. Fix this by skipping that entry.
System crashed with the following stack during a module unload test.
Call Trace:
sbitmap_init_node+0x7f/0x1e0
sbitmap_queue_init_node+0x24/0x150
blk_mq_init_bitmaps+0x3d/0xa0
blk_mq_init_tags+0x68/0x90
blk_mq_alloc_map_and_rqs+0x44/0x120
blk_mq_alloc_set_map_and_rqs+0x63/0x150
blk_mq_alloc_tag_set+0x11b/0x230
scsi_add_host_with_dma.cold+0x3f/0x245
qla2x00_probe_one+0xd5a/0x1b80 [qla2xxx]
Call Trace with slub_debug and debug kernel:
kasan_report_invalid_free+0x50/0x80
__kasan_slab_free+0x137/0x150
slab_free_freelist_hook+0xc6/0x190
kfree+0xe8/0x2e0
qla2x00_free_device+0x3bb/0x5d0 [qla2xxx]
qla2x00_remove_one+0x668/0xcf0 [qla2xxx]
Link: https://lore.kernel.org/r/20220310092604.22950-6-njavali@marvell.com
Fixes: 62e9dd177732 ("scsi: qla2xxx: Change in PUREX to handle FPIN ELS requests")
Cc: stable@vger.kernel.org
Reported-by: Marco Patalano <mpatalan@redhat.com>
Tested-by: Marco Patalano <mpatalan@redhat.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Arun Easi <aeasi@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 58c83525f006..9c4f2b38b34e 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -3901,6 +3901,8 @@ qla24xx_free_purex_list(struct purex_list *list)
spin_lock_irqsave(&list->lock, flags);
list_for_each_entry_safe(item, next, &list->head, list) {
list_del(&item->list);
+ if (item == &item->vha->default_item)
+ continue;
kfree(item);
}
spin_unlock_irqrestore(&list->lock, flags);
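The bug pattern here: a teardown loop frees every node on a list, but one node is embedded in a longer-lived structure rather than heap-allocated, and kfree() on that embedded node is the invalid free that KASAN flags above. A user-space sketch of the corrected loop (structure and names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct item {
	struct item *next;
};

struct host {
	struct item default_item;	/* embedded, must never be freed */
};

/* Unlink every node, but only free() the heap-allocated ones. */
static void free_list(struct item *head, struct host *h)
{
	struct item *it = head;

	while (it) {
		struct item *next = it->next;

		if (it != &h->default_item)
			free(it);	/* heap node */
		it = next;
	}
}

int main(void)
{
	struct host h = { { NULL } };
	struct item *heap = malloc(sizeof(*heap));

	if (!heap)
		return 1;
	heap->next = &h.default_item;	/* list: heap node -> embedded node */
	free_list(heap, &h);
	puts("list torn down without freeing the embedded node");
	return 0;
}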
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 713b415726f100f6644971e75ebfe1edbef1a390 Mon Sep 17 00:00:00 2001
From: Quinn Tran <qutran@marvell.com>
Date: Thu, 10 Mar 2022 01:25:59 -0800
Subject: [PATCH] scsi: qla2xxx: Fix laggy FC remote port session recovery
For session recovery, the driver relies on the dpc thread to initiate
certain operations. The dpc thread can only run while the mailbox
interface is unoccupied. A recent change that performs a heartbeat check
via mailbox cmd 0 was preventing the dpc thread from carrying out its
work. This patch allows the higher-priority error recovery to run before
the lower-priority heartbeat check.
Link: https://lore.kernel.org/r/20220310092604.22950-9-njavali@marvell.com
Fixes: d94d8158e184 ("scsi: qla2xxx: Add heartbeat check")
Cc: stable@vger.kernel.org
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Quinn Tran <qutran@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 8aa1cccebab1..d76c0e9f114c 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -4624,6 +4624,7 @@ struct qla_hw_data {
struct workqueue_struct *wq;
struct work_struct heartbeat_work;
struct qlfc_fw fw_buf;
+ unsigned long last_heartbeat_run_jiffies;
/* FCP_CMND priority support */
struct qla_fcp_prio_cfg *fcp_prio_cfg;
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 9c4f2b38b34e..81451c11eef4 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -7215,7 +7215,7 @@ static bool qla_do_heartbeat(struct scsi_qla_host *vha)
return do_heartbeat;
}
-static void qla_heart_beat(struct scsi_qla_host *vha)
+static void qla_heart_beat(struct scsi_qla_host *vha, u16 dpc_started)
{
struct qla_hw_data *ha = vha->hw;
@@ -7225,8 +7225,19 @@ static void qla_heart_beat(struct scsi_qla_host *vha)
if (vha->hw->flags.eeh_busy || qla2x00_chip_is_down(vha))
return;
- if (qla_do_heartbeat(vha))
+ /*
+ * dpc thread cannot run if heartbeat is running at the same time.
+ * We also do not want to starve heartbeat task. Therefore, do
+ * heartbeat task at least once every 5 seconds.
+ */
+ if (dpc_started &&
+ time_before(jiffies, ha->last_heartbeat_run_jiffies + 5 * HZ))
+ return;
+
+ if (qla_do_heartbeat(vha)) {
+ ha->last_heartbeat_run_jiffies = jiffies;
queue_work(ha->wq, &ha->heartbeat_work);
+ }
}
/**************************************************************************
@@ -7417,6 +7428,8 @@ qla2x00_timer(struct timer_list *t)
start_dpc++;
}
+ /* borrowing w to signify dpc will run */
+ w = 0;
/* Schedule the DPC routine if needed */
if ((test_bit(ISP_ABORT_NEEDED, &vha->dpc_flags) ||
test_bit(LOOP_RESYNC_NEEDED, &vha->dpc_flags) ||
@@ -7449,9 +7462,10 @@ qla2x00_timer(struct timer_list *t)
test_bit(RELOGIN_NEEDED, &vha->dpc_flags),
test_bit(PROCESS_PUREX_IOCB, &vha->dpc_flags));
qla2xxx_wake_dpc(vha);
+ w = 1;
}
- qla_heart_beat(vha);
+ qla_heart_beat(vha, w);
qla2x00_restart_timer(vha, WATCH_INTERVAL);
}
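The scheduling idea in the patch is a simple time-based yield: if higher-priority dpc work was just queued, skip the heartbeat unless it has already been deferred for the full interval (5 seconds), so the check is throttled but never starved. A user-space sketch of the same logic, using wall-clock seconds in place of jiffies and illustrative names:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static time_t last_heartbeat;	/* 0 until the heartbeat first runs */

static bool heartbeat_should_run(bool dpc_started)
{
	time_t now = time(NULL);

	/*
	 * Yield to the higher-priority dpc work, but never defer the
	 * heartbeat for more than 5 seconds in a row.
	 */
	if (dpc_started && now < last_heartbeat + 5)
		return false;

	last_heartbeat = now;
	return true;
}

int main(void)
{
	printf("first tick, dpc busy: %d\n", heartbeat_should_run(true));
	printf("next tick, dpc busy: %d\n", heartbeat_should_run(true));
	return 0;
}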
The patch below does not apply to the 4.9- or 4.14-stable trees.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c02aada06d19a215c8291bd968a99a270e96f734 Mon Sep 17 00:00:00 2001
From: Quinn Tran <qutran@marvell.com>
Date: Thu, 10 Mar 2022 01:25:58 -0800
Subject: [PATCH] scsi: qla2xxx: Fix hang due to session stuck
A user experienced device loss. The log shows the Get Port Database
command was queued up, failed, and requeued again; every time it was
requeued, it set FCF_ASYNC_ACTIVE. This prevents any recovery code from
running because the driver thinks a recovery is already in progress for
this session. In essence, the session is hung. It gets into this state
when session deletion, triggered by a link perturbation, races ahead of
this call.
Break the requeue cycle and exit; the session deletion code will trigger
a session relogin.
Link: https://lore.kernel.org/r/20220310092604.22950-8-njavali@marvell.com
Fixes: 726b85487067 ("qla2xxx: Add framework for async fabric discovery")
Cc: stable@vger.kernel.org
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Quinn Tran <qutran@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index bab2f665b6c2..8aa1cccebab1 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -5438,4 +5438,8 @@ struct ql_vnd_tgt_stats_resp {
#include "qla_gbl.h"
#include "qla_dbg.h"
#include "qla_inline.h"
+
+#define IS_SESSION_DELETED(_fcport) (_fcport->disc_state == DSC_DELETE_PEND || \
+ _fcport->disc_state == DSC_DELETED)
+
#endif
diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
index e468b05f90c0..5dfaa4d39cec 100644
--- a/drivers/scsi/qla2xxx/qla_init.c
+++ b/drivers/scsi/qla2xxx/qla_init.c
@@ -575,6 +575,14 @@ qla2x00_async_adisc(struct scsi_qla_host *vha, fc_port_t *fcport,
struct srb_iocb *lio;
int rval = QLA_FUNCTION_FAILED;
+ if (IS_SESSION_DELETED(fcport)) {
+ ql_log(ql_log_warn, vha, 0xffff,
+ "%s: %8phC is being delete - not sending command.\n",
+ __func__, fcport->port_name);
+ fcport->flags &= ~FCF_ASYNC_ACTIVE;
+ return rval;
+ }
+
if (!vha->flags.online || (fcport->flags & FCF_ASYNC_SENT))
return rval;
@@ -1338,8 +1346,15 @@ int qla24xx_async_gpdb(struct scsi_qla_host *vha, fc_port_t *fcport, u8 opt)
struct port_database_24xx *pd;
struct qla_hw_data *ha = vha->hw;
- if (!vha->flags.online || (fcport->flags & FCF_ASYNC_SENT) ||
- fcport->loop_id == FC_NO_LOOP_ID) {
+ if (IS_SESSION_DELETED(fcport)) {
+ ql_log(ql_log_warn, vha, 0xffff,
+ "%s: %8phC is being delete - not sending command.\n",
+ __func__, fcport->port_name);
+ fcport->flags &= ~FCF_ASYNC_ACTIVE;
+ return rval;
+ }
+
+ if (!vha->flags.online || fcport->flags & FCF_ASYNC_SENT) {
ql_log(ql_log_warn, vha, 0xffff,
"%s: %8phC online %d flags %x - not sending command.\n",
__func__, fcport->port_name, vha->flags.online, fcport->flags);
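The fix is a guard clause: if the session is already marked for deletion, refuse to (re)issue the command and clear the async-active flag so deletion-driven recovery is no longer blocked. A compact sketch of that state check (types and names are illustrative, not the driver's):

#include <stdbool.h>
#include <stdio.h>

enum disc_state { DSC_LOGIN_COMPLETE, DSC_DELETE_PEND, DSC_DELETED };

#define F_ASYNC_ACTIVE 0x1u

struct session {
	enum disc_state disc_state;
	unsigned int flags;
};

static bool queue_port_query(struct session *s)
{
	/*
	 * Deletion wins: bail out and drop the in-progress flag so the
	 * deletion path can schedule a fresh relogin.
	 */
	if (s->disc_state == DSC_DELETE_PEND || s->disc_state == DSC_DELETED) {
		s->flags &= ~F_ASYNC_ACTIVE;
		return false;
	}
	s->flags |= F_ASYNC_ACTIVE;
	/* ... issue the async port-database query here ... */
	return true;
}

int main(void)
{
	struct session s = { DSC_DELETE_PEND, F_ASYNC_ACTIVE };

	printf("queued: %d, flags: %#x\n", queue_port_query(&s), s.flags);
	return 0;
}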
The patch below does not apply to the 4.9-, 4.14-, 4.19-, 5.4-, 5.10-, 5.15-, or 5.16-stable trees.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d71165d3934e607070c4e48458c0cf161b1baea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 11 Mar 2022 12:47:33 +1000
Subject: [PATCH] powerpc/tm: Fix more userspace r13 corruption
Commit cf13435b730a ("powerpc/tm: Fix userspace r13 corruption") fixes a
problem in treclaim where a SLB miss can occur on the
thread_struct->ckpt_regs while SCRATCH0 is live with the saved user r13
value, clobbering it with the kernel r13 and ultimately resulting in
kernel r13 being stored in ckpt_regs.
There is an equivalent problem in trechkpt where the user r13 value is
loaded into r13 from chkpt_regs to be recheckpointed, but a SLB miss
could occur on ckpt_regs accesses after that, which will result in r13
being clobbered with a kernel value and that will get recheckpointed and
then restored to user registers.
The same memory page is accessed right before this critical window where
a SLB miss could cause corruption, so hitting the bug requires the SLB
entry be removed within a small window of instructions, which is
possible if a SLB related MCE hits there. PAPR also permits the
hypervisor to discard this SLB entry (because slb_shadow->persistent is
only set to SLB_NUM_BOLTED) although it's not known whether any
implementations would do this (KVM does not). So this is an extremely
unlikely bug, only found by inspection.
Fix this by also storing user r13 in a temporary location on the kernel
stack and don't change the r13 register from kernel r13 until the RI=0
critical section that does not fault.
The SCRATCH0 change is not strictly part of the fix, it's only used in
the RI=0 section so it does not have the same problem as the previous
SCRATCH0 bug.
Fixes: 98ae22e15b43 ("powerpc: Add helper functions for transactional memory context switching")
Cc: stable@vger.kernel.org # v3.9+
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220311024733.48926-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3beecc32940b..5a0f023a26e9 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -443,7 +443,8 @@ restore_gprs:
REST_GPR(0, r7) /* GPR0 */
REST_GPRS(2, 4, r7) /* GPR2-4 */
- REST_GPRS(8, 31, r7) /* GPR8-31 */
+ REST_GPRS(8, 12, r7) /* GPR8-12 */
+ REST_GPRS(14, 31, r7) /* GPR14-31 */
/* Load up PPR and DSCR here so we don't run with user values for long */
mtspr SPRN_DSCR, r5
@@ -479,18 +480,24 @@ restore_gprs:
REST_GPR(6, r7)
/*
- * Store r1 and r5 on the stack so that we can access them after we
- * clear MSR RI.
+ * Store user r1 and r5 and r13 on the stack (in the unused save
+ * areas / compiler reserved areas), so that we can access them after
+ * we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
- ld r5, GPR1(r7)
+ ld r5, GPR13(r7)
std r5, -16(r1)
+ ld r5, GPR1(r7)
+ std r5, -24(r1)
REST_GPR(7, r7)
- /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
+ /* Stash the stack pointer away for use after recheckpoint */
+ std r1, PACAR1(r13)
+
+ /* Clear MSR RI since we are about to clobber r13. EE is already off */
li r5, 0
mtmsrd r5, 1
@@ -501,9 +508,9 @@ restore_gprs:
* until we turn MSR RI back on.
*/
- SET_SCRATCH0(r1)
ld r5, -8(r1)
- ld r1, -16(r1)
+ ld r13, -16(r1)
+ ld r1, -24(r1)
/* Commit register state as checkpointed state: */
TRECHKPT
@@ -519,9 +526,9 @@ restore_gprs:
*/
GET_PACA(r13)
- GET_SCRATCH0(r1)
+ ld r1, PACAR1(r13)
- /* R1 is restored, so we are recoverable again. EE is still off */
+ /* R13, R1 is restored, so we are recoverable again. EE is still off */
li r4, MSR_RI
mtmsrd r4, 1
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d71165d3934e607070c4e48458c0cf161b1baea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Fri, 11 Mar 2022 12:47:33 +1000
Subject: [PATCH] powerpc/tm: Fix more userspace r13 corruption
Commit cf13435b730a ("powerpc/tm: Fix userspace r13 corruption") fixes a
problem in treclaim where a SLB miss can occur on the
thread_struct->ckpt_regs while SCRATCH0 is live with the saved user r13
value, clobbering it with the kernel r13 and ultimately resulting in
kernel r13 being stored in ckpt_regs.
There is an equivalent problem in trechkpt where the user r13 value is
loaded into r13 from chkpt_regs to be recheckpointed, but a SLB miss
could occur on ckpt_regs accesses after that, which will result in r13
being clobbered with a kernel value and that will get recheckpointed and
then restored to user registers.
The same memory page is accessed right before this critical window where
a SLB miss could cause corruption, so hitting the bug requires the SLB
entry be removed within a small window of instructions, which is
possible if a SLB related MCE hits there. PAPR also permits the
hypervisor to discard this SLB entry (because slb_shadow->persistent is
only set to SLB_NUM_BOLTED) although it's not known whether any
implementations would do this (KVM does not). So this is an extremely
unlikely bug, only found by inspection.
Fix this by also storing user r13 in a temporary location on the kernel
stack and don't change the r13 register from kernel r13 until the RI=0
critical section that does not fault.
The SCRATCH0 change is not strictly part of the fix, it's only used in
the RI=0 section so it does not have the same problem as the previous
SCRATCH0 bug.
Fixes: 98ae22e15b43 ("powerpc: Add helper functions for transactional memory context switching")
Cc: stable(a)vger.kernel.org # v3.9+
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Acked-by: Michael Neuling <mikey(a)neuling.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20220311024733.48926-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3beecc32940b..5a0f023a26e9 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -443,7 +443,8 @@ restore_gprs:
REST_GPR(0, r7) /* GPR0 */
REST_GPRS(2, 4, r7) /* GPR2-4 */
- REST_GPRS(8, 31, r7) /* GPR8-31 */
+ REST_GPRS(8, 12, r7) /* GPR8-12 */
+ REST_GPRS(14, 31, r7) /* GPR14-31 */
/* Load up PPR and DSCR here so we don't run with user values for long */
mtspr SPRN_DSCR, r5
@@ -479,18 +480,24 @@ restore_gprs:
REST_GPR(6, r7)
/*
- * Store r1 and r5 on the stack so that we can access them after we
- * clear MSR RI.
+ * Store user r1 and r5 and r13 on the stack (in the unused save
+ * areas / compiler reserved areas), so that we can access them after
+ * we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
- ld r5, GPR1(r7)
+ ld r5, GPR13(r7)
std r5, -16(r1)
+ ld r5, GPR1(r7)
+ std r5, -24(r1)
REST_GPR(7, r7)
- /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
+ /* Stash the stack pointer away for use after recheckpoint */
+ std r1, PACAR1(r13)
+
+ /* Clear MSR RI since we are about to clobber r13. EE is already off */
li r5, 0
mtmsrd r5, 1
@@ -501,9 +508,9 @@ restore_gprs:
* until we turn MSR RI back on.
*/
- SET_SCRATCH0(r1)
ld r5, -8(r1)
- ld r1, -16(r1)
+ ld r13, -16(r1)
+ ld r1, -24(r1)
/* Commit register state as checkpointed state: */
TRECHKPT
@@ -519,9 +526,9 @@ restore_gprs:
*/
GET_PACA(r13)
- GET_SCRATCH0(r1)
+ ld r1, PACAR1(r13)
- /* R1 is restored, so we are recoverable again. EE is still off */
+ /* R13, R1 is restored, so we are recoverable again. EE is still off */
li r4, MSR_RI
mtmsrd r4, 1
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d71165d3934e607070c4e48458c0cf161b1baea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Fri, 11 Mar 2022 12:47:33 +1000
Subject: [PATCH] powerpc/tm: Fix more userspace r13 corruption
Commit cf13435b730a ("powerpc/tm: Fix userspace r13 corruption") fixes a
problem in treclaim where a SLB miss can occur on the
thread_struct->ckpt_regs while SCRATCH0 is live with the saved user r13
value, clobbering it with the kernel r13 and ultimately resulting in
kernel r13 being stored in ckpt_regs.
There is an equivalent problem in trechkpt where the user r13 value is
loaded into r13 from chkpt_regs to be recheckpointed, but a SLB miss
could occur on ckpt_regs accesses after that, which will result in r13
being clobbered with a kernel value and that will get recheckpointed and
then restored to user registers.
The same memory page is accessed right before this critical window where
a SLB miss could cause corruption, so hitting the bug requires the SLB
entry be removed within a small window of instructions, which is
possible if a SLB related MCE hits there. PAPR also permits the
hypervisor to discard this SLB entry (because slb_shadow->persistent is
only set to SLB_NUM_BOLTED) although it's not known whether any
implementations would do this (KVM does not). So this is an extremely
unlikely bug, only found by inspection.
Fix this by also storing user r13 in a temporary location on the kernel
stack and don't change the r13 register from kernel r13 until the RI=0
critical section that does not fault.
The SCRATCH0 change is not strictly part of the fix, it's only used in
the RI=0 section so it does not have the same problem as the previous
SCRATCH0 bug.
Fixes: 98ae22e15b43 ("powerpc: Add helper functions for transactional memory context switching")
Cc: stable(a)vger.kernel.org # v3.9+
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Acked-by: Michael Neuling <mikey(a)neuling.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20220311024733.48926-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3beecc32940b..5a0f023a26e9 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -443,7 +443,8 @@ restore_gprs:
REST_GPR(0, r7) /* GPR0 */
REST_GPRS(2, 4, r7) /* GPR2-4 */
- REST_GPRS(8, 31, r7) /* GPR8-31 */
+ REST_GPRS(8, 12, r7) /* GPR8-12 */
+ REST_GPRS(14, 31, r7) /* GPR14-31 */
/* Load up PPR and DSCR here so we don't run with user values for long */
mtspr SPRN_DSCR, r5
@@ -479,18 +480,24 @@ restore_gprs:
REST_GPR(6, r7)
/*
- * Store r1 and r5 on the stack so that we can access them after we
- * clear MSR RI.
+ * Store user r1 and r5 and r13 on the stack (in the unused save
+ * areas / compiler reserved areas), so that we can access them after
+ * we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
- ld r5, GPR1(r7)
+ ld r5, GPR13(r7)
std r5, -16(r1)
+ ld r5, GPR1(r7)
+ std r5, -24(r1)
REST_GPR(7, r7)
- /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
+ /* Stash the stack pointer away for use after recheckpoint */
+ std r1, PACAR1(r13)
+
+ /* Clear MSR RI since we are about to clobber r13. EE is already off */
li r5, 0
mtmsrd r5, 1
@@ -501,9 +508,9 @@ restore_gprs:
* until we turn MSR RI back on.
*/
- SET_SCRATCH0(r1)
ld r5, -8(r1)
- ld r1, -16(r1)
+ ld r13, -16(r1)
+ ld r1, -24(r1)
/* Commit register state as checkpointed state: */
TRECHKPT
@@ -519,9 +526,9 @@ restore_gprs:
*/
GET_PACA(r13)
- GET_SCRATCH0(r1)
+ ld r1, PACAR1(r13)
- /* R1 is restored, so we are recoverable again. EE is still off */
+ /* R13, R1 is restored, so we are recoverable again. EE is still off */
li r4, MSR_RI
mtmsrd r4, 1
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d71165d3934e607070c4e48458c0cf161b1baea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Fri, 11 Mar 2022 12:47:33 +1000
Subject: [PATCH] powerpc/tm: Fix more userspace r13 corruption
Commit cf13435b730a ("powerpc/tm: Fix userspace r13 corruption") fixes a
problem in treclaim where a SLB miss can occur on the
thread_struct->ckpt_regs while SCRATCH0 is live with the saved user r13
value, clobbering it with the kernel r13 and ultimately resulting in
kernel r13 being stored in ckpt_regs.
There is an equivalent problem in trechkpt where the user r13 value is
loaded into r13 from chkpt_regs to be recheckpointed, but a SLB miss
could occur on ckpt_regs accesses after that, which will result in r13
being clobbered with a kernel value and that will get recheckpointed and
then restored to user registers.
The same memory page is accessed right before this critical window where
a SLB miss could cause corruption, so hitting the bug requires the SLB
entry be removed within a small window of instructions, which is
possible if a SLB related MCE hits there. PAPR also permits the
hypervisor to discard this SLB entry (because slb_shadow->persistent is
only set to SLB_NUM_BOLTED) although it's not known whether any
implementations would do this (KVM does not). So this is an extremely
unlikely bug, only found by inspection.
Fix this by also storing user r13 in a temporary location on the kernel
stack and don't change the r13 register from kernel r13 until the RI=0
critical section that does not fault.
The SCRATCH0 change is not strictly part of the fix, it's only used in
the RI=0 section so it does not have the same problem as the previous
SCRATCH0 bug.
Fixes: 98ae22e15b43 ("powerpc: Add helper functions for transactional memory context switching")
Cc: stable(a)vger.kernel.org # v3.9+
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Acked-by: Michael Neuling <mikey(a)neuling.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20220311024733.48926-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3beecc32940b..5a0f023a26e9 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -443,7 +443,8 @@ restore_gprs:
REST_GPR(0, r7) /* GPR0 */
REST_GPRS(2, 4, r7) /* GPR2-4 */
- REST_GPRS(8, 31, r7) /* GPR8-31 */
+ REST_GPRS(8, 12, r7) /* GPR8-12 */
+ REST_GPRS(14, 31, r7) /* GPR14-31 */
/* Load up PPR and DSCR here so we don't run with user values for long */
mtspr SPRN_DSCR, r5
@@ -479,18 +480,24 @@ restore_gprs:
REST_GPR(6, r7)
/*
- * Store r1 and r5 on the stack so that we can access them after we
- * clear MSR RI.
+ * Store user r1 and r5 and r13 on the stack (in the unused save
+ * areas / compiler reserved areas), so that we can access them after
+ * we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
- ld r5, GPR1(r7)
+ ld r5, GPR13(r7)
std r5, -16(r1)
+ ld r5, GPR1(r7)
+ std r5, -24(r1)
REST_GPR(7, r7)
- /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
+ /* Stash the stack pointer away for use after recheckpoint */
+ std r1, PACAR1(r13)
+
+ /* Clear MSR RI since we are about to clobber r13. EE is already off */
li r5, 0
mtmsrd r5, 1
@@ -501,9 +508,9 @@ restore_gprs:
* until we turn MSR RI back on.
*/
- SET_SCRATCH0(r1)
ld r5, -8(r1)
- ld r1, -16(r1)
+ ld r13, -16(r1)
+ ld r1, -24(r1)
/* Commit register state as checkpointed state: */
TRECHKPT
@@ -519,9 +526,9 @@ restore_gprs:
*/
GET_PACA(r13)
- GET_SCRATCH0(r1)
+ ld r1, PACAR1(r13)
- /* R1 is restored, so we are recoverable again. EE is still off */
+ /* R13, R1 is restored, so we are recoverable again. EE is still off */
li r4, MSR_RI
mtmsrd r4, 1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d71165d3934e607070c4e48458c0cf161b1baea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Fri, 11 Mar 2022 12:47:33 +1000
Subject: [PATCH] powerpc/tm: Fix more userspace r13 corruption
Commit cf13435b730a ("powerpc/tm: Fix userspace r13 corruption") fixes a
problem in treclaim where a SLB miss can occur on the
thread_struct->ckpt_regs while SCRATCH0 is live with the saved user r13
value, clobbering it with the kernel r13 and ultimately resulting in
kernel r13 being stored in ckpt_regs.
There is an equivalent problem in trechkpt where the user r13 value is
loaded into r13 from chkpt_regs to be recheckpointed, but a SLB miss
could occur on ckpt_regs accesses after that, which will result in r13
being clobbered with a kernel value and that will get recheckpointed and
then restored to user registers.
The same memory page is accessed right before this critical window where
a SLB miss could cause corruption, so hitting the bug requires the SLB
entry be removed within a small window of instructions, which is
possible if a SLB related MCE hits there. PAPR also permits the
hypervisor to discard this SLB entry (because slb_shadow->persistent is
only set to SLB_NUM_BOLTED) although it's not known whether any
implementations would do this (KVM does not). So this is an extremely
unlikely bug, only found by inspection.
Fix this by also storing user r13 in a temporary location on the kernel
stack and don't change the r13 register from kernel r13 until the RI=0
critical section that does not fault.
The SCRATCH0 change is not strictly part of the fix, it's only used in
the RI=0 section so it does not have the same problem as the previous
SCRATCH0 bug.
Fixes: 98ae22e15b43 ("powerpc: Add helper functions for transactional memory context switching")
Cc: stable(a)vger.kernel.org # v3.9+
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Acked-by: Michael Neuling <mikey(a)neuling.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20220311024733.48926-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3beecc32940b..5a0f023a26e9 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -443,7 +443,8 @@ restore_gprs:
REST_GPR(0, r7) /* GPR0 */
REST_GPRS(2, 4, r7) /* GPR2-4 */
- REST_GPRS(8, 31, r7) /* GPR8-31 */
+ REST_GPRS(8, 12, r7) /* GPR8-12 */
+ REST_GPRS(14, 31, r7) /* GPR14-31 */
/* Load up PPR and DSCR here so we don't run with user values for long */
mtspr SPRN_DSCR, r5
@@ -479,18 +480,24 @@ restore_gprs:
REST_GPR(6, r7)
/*
- * Store r1 and r5 on the stack so that we can access them after we
- * clear MSR RI.
+ * Store user r1 and r5 and r13 on the stack (in the unused save
+ * areas / compiler reserved areas), so that we can access them after
+ * we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
- ld r5, GPR1(r7)
+ ld r5, GPR13(r7)
std r5, -16(r1)
+ ld r5, GPR1(r7)
+ std r5, -24(r1)
REST_GPR(7, r7)
- /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
+ /* Stash the stack pointer away for use after recheckpoint */
+ std r1, PACAR1(r13)
+
+ /* Clear MSR RI since we are about to clobber r13. EE is already off */
li r5, 0
mtmsrd r5, 1
@@ -501,9 +508,9 @@ restore_gprs:
* until we turn MSR RI back on.
*/
- SET_SCRATCH0(r1)
ld r5, -8(r1)
- ld r1, -16(r1)
+ ld r13, -16(r1)
+ ld r1, -24(r1)
/* Commit register state as checkpointed state: */
TRECHKPT
@@ -519,9 +526,9 @@ restore_gprs:
*/
GET_PACA(r13)
- GET_SCRATCH0(r1)
+ ld r1, PACAR1(r13)
- /* R1 is restored, so we are recoverable again. EE is still off */
+ /* R13, R1 is restored, so we are recoverable again. EE is still off */
li r4, MSR_RI
mtmsrd r4, 1
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d71165d3934e607070c4e48458c0cf161b1baea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Fri, 11 Mar 2022 12:47:33 +1000
Subject: [PATCH] powerpc/tm: Fix more userspace r13 corruption
Commit cf13435b730a ("powerpc/tm: Fix userspace r13 corruption") fixes a
problem in treclaim where a SLB miss can occur on the
thread_struct->ckpt_regs while SCRATCH0 is live with the saved user r13
value, clobbering it with the kernel r13 and ultimately resulting in
kernel r13 being stored in ckpt_regs.
There is an equivalent problem in trechkpt where the user r13 value is
loaded into r13 from chkpt_regs to be recheckpointed, but a SLB miss
could occur on ckpt_regs accesses after that, which will result in r13
being clobbered with a kernel value and that will get recheckpointed and
then restored to user registers.
The same memory page is accessed right before this critical window where
a SLB miss could cause corruption, so hitting the bug requires the SLB
entry be removed within a small window of instructions, which is
possible if a SLB related MCE hits there. PAPR also permits the
hypervisor to discard this SLB entry (because slb_shadow->persistent is
only set to SLB_NUM_BOLTED) although it's not known whether any
implementations would do this (KVM does not). So this is an extremely
unlikely bug, only found by inspection.
Fix this by also storing user r13 in a temporary location on the kernel
stack and don't change the r13 register from kernel r13 until the RI=0
critical section that does not fault.
The SCRATCH0 change is not strictly part of the fix, it's only used in
the RI=0 section so it does not have the same problem as the previous
SCRATCH0 bug.
Fixes: 98ae22e15b43 ("powerpc: Add helper functions for transactional memory context switching")
Cc: stable(a)vger.kernel.org # v3.9+
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Acked-by: Michael Neuling <mikey(a)neuling.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20220311024733.48926-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3beecc32940b..5a0f023a26e9 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -443,7 +443,8 @@ restore_gprs:
REST_GPR(0, r7) /* GPR0 */
REST_GPRS(2, 4, r7) /* GPR2-4 */
- REST_GPRS(8, 31, r7) /* GPR8-31 */
+ REST_GPRS(8, 12, r7) /* GPR8-12 */
+ REST_GPRS(14, 31, r7) /* GPR14-31 */
/* Load up PPR and DSCR here so we don't run with user values for long */
mtspr SPRN_DSCR, r5
@@ -479,18 +480,24 @@ restore_gprs:
REST_GPR(6, r7)
/*
- * Store r1 and r5 on the stack so that we can access them after we
- * clear MSR RI.
+ * Store user r1 and r5 and r13 on the stack (in the unused save
+ * areas / compiler reserved areas), so that we can access them after
+ * we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
- ld r5, GPR1(r7)
+ ld r5, GPR13(r7)
std r5, -16(r1)
+ ld r5, GPR1(r7)
+ std r5, -24(r1)
REST_GPR(7, r7)
- /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
+ /* Stash the stack pointer away for use after recheckpoint */
+ std r1, PACAR1(r13)
+
+ /* Clear MSR RI since we are about to clobber r13. EE is already off */
li r5, 0
mtmsrd r5, 1
@@ -501,9 +508,9 @@ restore_gprs:
* until we turn MSR RI back on.
*/
- SET_SCRATCH0(r1)
ld r5, -8(r1)
- ld r1, -16(r1)
+ ld r13, -16(r1)
+ ld r1, -24(r1)
/* Commit register state as checkpointed state: */
TRECHKPT
@@ -519,9 +526,9 @@ restore_gprs:
*/
GET_PACA(r13)
- GET_SCRATCH0(r1)
+ ld r1, PACAR1(r13)
- /* R1 is restored, so we are recoverable again. EE is still off */
+ /* R13, R1 is restored, so we are recoverable again. EE is still off */
li r4, MSR_RI
mtmsrd r4, 1
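The shape of the fix deserves a note: everything the RI=0 window needs
is first copied out of memory that can take an SLB fault (ckpt_regs)
into storage that cannot (the redzone below the stack pointer, plus
PACAR1 for the kernel r1), and only then are MSR.RI cleared and r13
switched away from the kernel value. Below is a schematic C rendering of
that ordering with made-up names; the real code is the assembly above.

#include <stdint.h>
#include <stdio.h>

struct ckpt_regs { unsigned long long r1, r5, r13; }; /* may fault */
struct safe_area { unsigned long long r1, r5, r13; }; /* never faults */

static void restore_checkpoint(const struct ckpt_regs *ckpt)
{
        struct safe_area safe;      /* models the redzone below r1 */
        unsigned long long paca_r1; /* models the PACAR1 stash */

        /* Phase 1 (RI=1, faults recoverable): pull everything the
         * no-fault window will need out of the faultable ckpt_regs. */
        safe.r5 = ckpt->r5;
        safe.r13 = ckpt->r13;
        safe.r1 = ckpt->r1;
        paca_r1 = (unsigned long long)(uintptr_t)&safe;

        /* Phase 2 (RI=0, no faults allowed): touch only the safe
         * copies; this models the window around TRECHKPT where an
         * SLB miss on ckpt_regs would corrupt state. */
        printf("trechkpt: r1=%#llx r5=%#llx r13=%#llx (kernel r1 %#llx)\n",
               safe.r1, safe.r5, safe.r13, paca_r1);
}

int main(void)
{
        struct ckpt_regs ckpt = { 0x7fffdead0000ULL, 5, 0x7fffbeef0000ULL };
        restore_checkpoint(&ckpt);
        return 0;
}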
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d71165d3934e607070c4e48458c0cf161b1baea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin(a)gmail.com>
Date: Fri, 11 Mar 2022 12:47:33 +1000
Subject: [PATCH] powerpc/tm: Fix more userspace r13 corruption
Commit cf13435b730a ("powerpc/tm: Fix userspace r13 corruption") fixes a
problem in treclaim where an SLB miss can occur on the
thread_struct->ckpt_regs while SCRATCH0 is live with the saved user r13
value, clobbering it with the kernel r13 and ultimately resulting in
kernel r13 being stored in ckpt_regs.
There is an equivalent problem in trechkpt where the user r13 value is
loaded into r13 from ckpt_regs to be recheckpointed, but an SLB miss
could occur on later ckpt_regs accesses, clobbering r13 with a kernel
value that then gets recheckpointed and restored to user registers.
The same memory page is accessed right before this critical window where
an SLB miss could cause corruption, so hitting the bug requires the SLB
entry to be removed within a small window of instructions, which is
possible if an SLB-related MCE hits there. PAPR also permits the
hypervisor to discard this SLB entry (because slb_shadow->persistent is
only set to SLB_NUM_BOLTED), although it is not known whether any
implementations would do this (KVM does not). So this is an extremely
unlikely bug, found only by inspection.
Fix this by also storing user r13 in a temporary location on the kernel
stack, and by not switching the r13 register away from kernel r13 until
the RI=0 critical section, which does not fault.
The SCRATCH0 change is not strictly part of the fix; SCRATCH0 is now
only used in the RI=0 section, so it does not have the same problem as
the previous SCRATCH0 bug.
Fixes: 98ae22e15b43 ("powerpc: Add helper functions for transactional memory context switching")
Cc: stable(a)vger.kernel.org # v3.9+
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Acked-by: Michael Neuling <mikey(a)neuling.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
Link: https://lore.kernel.org/r/20220311024733.48926-1-npiggin@gmail.com
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3beecc32940b..5a0f023a26e9 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -443,7 +443,8 @@ restore_gprs:
REST_GPR(0, r7) /* GPR0 */
REST_GPRS(2, 4, r7) /* GPR2-4 */
- REST_GPRS(8, 31, r7) /* GPR8-31 */
+ REST_GPRS(8, 12, r7) /* GPR8-12 */
+ REST_GPRS(14, 31, r7) /* GPR14-31 */
/* Load up PPR and DSCR here so we don't run with user values for long */
mtspr SPRN_DSCR, r5
@@ -479,18 +480,24 @@ restore_gprs:
REST_GPR(6, r7)
/*
- * Store r1 and r5 on the stack so that we can access them after we
- * clear MSR RI.
+ * Store user r1 and r5 and r13 on the stack (in the unused save
+ * areas / compiler reserved areas), so that we can access them after
+ * we clear MSR RI.
*/
REST_GPR(5, r7)
std r5, -8(r1)
- ld r5, GPR1(r7)
+ ld r5, GPR13(r7)
std r5, -16(r1)
+ ld r5, GPR1(r7)
+ std r5, -24(r1)
REST_GPR(7, r7)
- /* Clear MSR RI since we are about to use SCRATCH0. EE is already off */
+ /* Stash the stack pointer away for use after recheckpoint */
+ std r1, PACAR1(r13)
+
+ /* Clear MSR RI since we are about to clobber r13. EE is already off */
li r5, 0
mtmsrd r5, 1
@@ -501,9 +508,9 @@ restore_gprs:
* until we turn MSR RI back on.
*/
- SET_SCRATCH0(r1)
ld r5, -8(r1)
- ld r1, -16(r1)
+ ld r13, -16(r1)
+ ld r1, -24(r1)
/* Commit register state as checkpointed state: */
TRECHKPT
@@ -519,9 +526,9 @@ restore_gprs:
*/
GET_PACA(r13)
- GET_SCRATCH0(r1)
+ ld r1, PACAR1(r13)
- /* R1 is restored, so we are recoverable again. EE is still off */
+ /* R13, R1 is restored, so we are recoverable again. EE is still off */
li r4, MSR_RI
mtmsrd r4, 1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 83b83a02073ec8d18c77a9bbe0881d710f7a9d32 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 15 Dec 2021 01:15:54 +0000
Subject: [PATCH] KVM: x86/mmu: Use common TDP MMU zap helper for MMU notifier
unmap hook
Use the common TDP MMU zap helper when handling an MMU notifier unmap
event; the two flows are semantically identical. Consolidate the code in
preparation for a future bug fix, as both kvm_tdp_mmu_unmap_gfn_range()
and __kvm_tdp_mmu_zap_gfn_range() are guilty of not zapping SPTEs in
invalid roots.
No functional change intended.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20211215011557.399940-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index bc9e3553fba2..b9814e2b397f 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1032,13 +1032,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush)
{
- struct kvm_mmu_page *root;
-
- for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
- flush = zap_gfn_range(kvm, root, range->start, range->end,
- range->may_block, flush, false);
-
- return flush;
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
+ range->end, range->may_block, flush);
}
typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
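The refactor above follows the usual consolidation pattern: once both
callers funnel through one zap primitive, the follow-up fix (zapping
SPTEs in invalid roots too) only has to land in one place. A stand-alone
sketch of that shape, with stand-in types and names that are not KVM's:

#include <stdbool.h>
#include <stdio.h>

struct gfn_range { unsigned long start, end; bool may_block; };

/* The one shared zap primitive. A later fix here -- e.g. also
 * walking invalid roots -- covers every caller at once. */
static bool zap_gfn_range(struct gfn_range r, bool flush)
{
        printf("zap gfns [%#lx, %#lx)\n", r.start, r.end);
        return true; /* pretend a TLB flush became necessary */
}

/* The MMU-notifier unmap hook collapses to a thin wrapper. */
static bool unmap_gfn_range(struct gfn_range r, bool flush)
{
        return zap_gfn_range(r, flush);
}

int main(void)
{
        struct gfn_range r = { 0x1000, 0x2000, true };
        printf("flush needed: %d\n", unmap_gfn_range(r, false));
        return 0;
}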
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d6174299365ddbbf491620c0b8c5ca1a6ef2eea5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Wed, 9 Feb 2022 04:56:05 -0500
Subject: [PATCH] KVM: x86: Reinitialize context if host userspace toggles
EFER.LME
While the guest runs, EFER.LME cannot change unless CR0.PG is clear, and
therefore EFER.NX is the only bit that can affect the MMU role. However,
set_efer accepts a host-initiated change to EFER.LME even with CR0.PG=1.
In that case, the MMU has to be reset.
Fixes: 11988499e62b ("KVM: x86: Skip EFER vs. guest CPUID checks for host-initiated writes")
Cc: stable(a)vger.kernel.org
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 51faa2c76ca5..a5a50cfeffff 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,7 @@
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
static __always_inline u64 rsvd_bits(int s, int e)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b95c379e234..b724273493d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1648,8 +1648,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
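The heart of the fix is the widened changed-bits test: XOR the old and
new EFER values to find the flipped bits, then mask with the bits that
feed the MMU role. A minimal user-space sketch of that test, with the
EFER bit positions hard-coded from the x86 architectural layout rather
than taken from kernel headers:

#include <stdint.h>
#include <stdio.h>

#define EFER_LME (1ULL << 8)   /* long mode enable */
#define EFER_NX  (1ULL << 11)  /* no-execute enable */
#define MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)

/* Nonzero when an EFER write flips a bit that feeds the MMU role. */
static int efer_needs_mmu_reset(uint64_t old_efer, uint64_t new_efer)
{
        return !!((old_efer ^ new_efer) & MMU_EFER_ROLE_BITS);
}

int main(void)
{
        /* Host-initiated EFER.LME toggle: caught by the new mask,
         * missed by the old EFER_NX-only check. */
        printf("LME toggle resets MMU: %d\n",
               efer_needs_mmu_reset(0, EFER_LME));
        /* An unrelated bit (e.g. SCE, bit 0) does not force a reset. */
        printf("SCE toggle resets MMU: %d\n",
               efer_needs_mmu_reset(0, 1ULL << 0));
        return 0;
}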
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d6174299365ddbbf491620c0b8c5ca1a6ef2eea5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Wed, 9 Feb 2022 04:56:05 -0500
Subject: [PATCH] KVM: x86: Reinitialize context if host userspace toggles
EFER.LME
While the guest runs, EFER.LME cannot change unless CR0.PG is clear, and
therefore EFER.NX is the only bit that can affect the MMU role. However,
set_efer accepts a host-initiated change to EFER.LME even with CR0.PG=1.
In that case, the MMU has to be reset.
Fixes: 11988499e62b ("KVM: x86: Skip EFER vs. guest CPUID checks for host-initiated writes")
Cc: stable(a)vger.kernel.org
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 51faa2c76ca5..a5a50cfeffff 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,7 @@
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
static __always_inline u64 rsvd_bits(int s, int e)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b95c379e234..b724273493d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1648,8 +1648,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d6174299365ddbbf491620c0b8c5ca1a6ef2eea5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Wed, 9 Feb 2022 04:56:05 -0500
Subject: [PATCH] KVM: x86: Reinitialize context if host userspace toggles
EFER.LME
While the guest runs, EFER.LME cannot change unless CR0.PG is clear, and
therefore EFER.NX is the only bit that can affect the MMU role. However,
set_efer accepts a host-initiated change to EFER.LME even with CR0.PG=1.
In that case, the MMU has to be reset.
Fixes: 11988499e62b ("KVM: x86: Skip EFER vs. guest CPUID checks for host-initiated writes")
Cc: stable(a)vger.kernel.org
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 51faa2c76ca5..a5a50cfeffff 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,7 @@
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
static __always_inline u64 rsvd_bits(int s, int e)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b95c379e234..b724273493d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1648,8 +1648,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d6174299365ddbbf491620c0b8c5ca1a6ef2eea5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Wed, 9 Feb 2022 04:56:05 -0500
Subject: [PATCH] KVM: x86: Reinitialize context if host userspace toggles
EFER.LME
While the guest runs, EFER.LME cannot change unless CR0.PG is clear, and
therefore EFER.NX is the only bit that can affect the MMU role. However,
set_efer accepts a host-initiated change to EFER.LME even with CR0.PG=1.
In that case, the MMU has to be reset.
Fixes: 11988499e62b ("KVM: x86: Skip EFER vs. guest CPUID checks for host-initiated writes")
Cc: stable(a)vger.kernel.org
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 51faa2c76ca5..a5a50cfeffff 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,7 @@
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
static __always_inline u64 rsvd_bits(int s, int e)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b95c379e234..b724273493d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1648,8 +1648,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d6174299365ddbbf491620c0b8c5ca1a6ef2eea5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini(a)redhat.com>
Date: Wed, 9 Feb 2022 04:56:05 -0500
Subject: [PATCH] KVM: x86: Reinitialize context if host userspace toggles
EFER.LME
While the guest runs, EFER.LME cannot change unless CR0.PG is clear, and
therefore EFER.NX is the only bit that can affect the MMU role. However,
set_efer accepts a host-initiated change to EFER.LME even with CR0.PG=1.
In that case, the MMU has to be reset.
Fixes: 11988499e62b ("KVM: x86: Skip EFER vs. guest CPUID checks for host-initiated writes")
Cc: stable(a)vger.kernel.org
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 51faa2c76ca5..a5a50cfeffff 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,6 +48,7 @@
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
+#define KVM_MMU_EFER_ROLE_BITS (EFER_LME | EFER_NX)
static __always_inline u64 rsvd_bits(int s, int e)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b95c379e234..b724273493d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1648,8 +1648,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return r;
}
- /* Update reserved bits */
- if ((efer ^ old_efer) & EFER_NX)
+ if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
return 0;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a204f7895878363ca8211f50ec610408c8c70aa Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Date: Thu, 10 Feb 2022 18:08:51 -0600
Subject: [PATCH] KVM: SVM: Allow AVIC support on system w/ physical APIC ID >
255
Expand KVM's mask for the AVIC host physical ID to the full 12 bits defined
by the architecture. The number of bits consumed by hardware is model
specific, e.g. early CPUs ignored bits 11:8, but there is no way for KVM
to enumerate the "true" size. So, KVM must allow using all bits, else it
risks rejecting completely legal x2APIC IDs on newer CPUs.
This means KVM relies on hardware to not assign x2APIC IDs that exceed the
"true" width of the field, but presumably hardware is smart enough to tie
the width to the max x2APIC ID. KVM also relies on hardware to support at
least 8 bits, as the legacy xAPIC ID is writable by software. But, those
assumptions are unavoidable due to the lack of any way to enumerate the
"true" width.
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Message-Id: <20220211000851.185799-1-suravee.suthikulpanit(a)amd.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bb2fb78523ce..7eb2df5417fb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -226,7 +226,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1afde44b1252..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -870,17 +870,12 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -569,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
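Note how the mask change also changes the style of the bounds check:
for an unsigned ID, "id > mask" and "(id & ~mask) != 0" are equivalent,
but the latter is the idiomatic form for a widened bit field. A small
stand-alone sketch, with GENMASK_ULL re-derived locally and an
illustrative APIC ID:

#include <stdint.h>
#include <stdio.h>

/* Local re-derivation of the kernel's GENMASK_ULL(h, l). */
#define GENMASK_ULL(h, l) \
        (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

#define OLD_HOST_ID_MASK 0xFFULL            /* 8-bit field */
#define NEW_HOST_ID_MASK GENMASK_ULL(11, 0) /* full 12-bit field */

int main(void)
{
        uint64_t apic_id = 0x130; /* a physical APIC ID above 255 */

        /* Old check: rejected, although the ID is architecturally legal. */
        printf("old mask rejects %#llx: %d\n",
               (unsigned long long)apic_id, apic_id > OLD_HOST_ID_MASK);
        /* New check: accepted; only bits above bit 11 are out of bounds. */
        printf("new mask rejects %#llx: %d\n",
               (unsigned long long)apic_id,
               (apic_id & ~NEW_HOST_ID_MASK) != 0);
        return 0;
}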
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a204f7895878363ca8211f50ec610408c8c70aa Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Date: Thu, 10 Feb 2022 18:08:51 -0600
Subject: [PATCH] KVM: SVM: Allow AVIC support on system w/ physical APIC ID >
255
Expand KVM's mask for the AVIC host physical ID to the full 12 bits defined
by the architecture. The number of bits consumed by hardware is model
specific, e.g. early CPUs ignored bits 11:8, but there is no way for KVM
to enumerate the "true" size. So, KVM must allow using all bits, else it
risks rejecting completely legal x2APIC IDs on newer CPUs.
This means KVM relies on hardware to not assign x2APIC IDs that exceed the
"true" width of the field, but presumably hardware is smart enough to tie
the width to the max x2APIC ID. KVM also relies on hardware to support at
least 8 bits, as the legacy xAPIC ID is writable by software. But, those
assumptions are unavoidable due to the lack of any way to enumerate the
"true" width.
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Message-Id: <20220211000851.185799-1-suravee.suthikulpanit(a)amd.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bb2fb78523ce..7eb2df5417fb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -226,7 +226,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1afde44b1252..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -870,17 +870,12 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -569,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a204f7895878363ca8211f50ec610408c8c70aa Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Date: Thu, 10 Feb 2022 18:08:51 -0600
Subject: [PATCH] KVM: SVM: Allow AVIC support on system w/ physical APIC ID >
255
Expand KVM's mask for the AVIC host physical ID to the full 12 bits defined
by the architecture. The number of bits consumed by hardware is model
specific, e.g. early CPUs ignored bits 11:8, but there is no way for KVM
to enumerate the "true" size. So, KVM must allow using all bits, else it
risks rejecting completely legal x2APIC IDs on newer CPUs.
This means KVM relies on hardware to not assign x2APIC IDs that exceed the
"true" width of the field, but presumably hardware is smart enough to tie
the width to the max x2APIC ID. KVM also relies on hardware to support at
least 8 bits, as the legacy xAPIC ID is writable by software. But, those
assumptions are unavoidable due to the lack of any way to enumerate the
"true" width.
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Message-Id: <20220211000851.185799-1-suravee.suthikulpanit(a)amd.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bb2fb78523ce..7eb2df5417fb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -226,7 +226,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1afde44b1252..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -870,17 +870,12 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -569,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a204f7895878363ca8211f50ec610408c8c70aa Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Date: Thu, 10 Feb 2022 18:08:51 -0600
Subject: [PATCH] KVM: SVM: Allow AVIC support on system w/ physical APIC ID >
255
Expand KVM's mask for the AVIC host physical ID to the full 12 bits defined
by the architecture. The number of bits consumed by hardware is model
specific, e.g. early CPUs ignored bits 11:8, but there is no way for KVM
to enumerate the "true" size. So, KVM must allow using all bits, else it
risks rejecting completely legal x2APIC IDs on newer CPUs.
This means KVM relies on hardware to not assign x2APIC IDs that exceed the
"true" width of the field, but presumably hardware is smart enough to tie
the width to the max x2APIC ID. KVM also relies on hardware to support at
least 8 bits, as the legacy xAPIC ID is writable by software. But, those
assumptions are unavoidable due to the lack of any way to enumerate the
"true" width.
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Message-Id: <20220211000851.185799-1-suravee.suthikulpanit(a)amd.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bb2fb78523ce..7eb2df5417fb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -226,7 +226,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1afde44b1252..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -870,17 +870,12 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -569,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a204f7895878363ca8211f50ec610408c8c70aa Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Date: Thu, 10 Feb 2022 18:08:51 -0600
Subject: [PATCH] KVM: SVM: Allow AVIC support on system w/ physical APIC ID >
255
Expand KVM's mask for the AVIC host physical ID to the full 12 bits defined
by the architecture. The number of bits consumed by hardware is model
specific, e.g. early CPUs ignored bits 11:8, but there is no way for KVM
to enumerate the "true" size. So, KVM must allow using all bits, else it
risks rejecting completely legal x2APIC IDs on newer CPUs.
This means KVM relies on hardware to not assign x2APIC IDs that exceed the
"true" width of the field, but presumably hardware is smart enough to tie
the width to the max x2APIC ID. KVM also relies on hardware to support at
least 8 bits, as the legacy xAPIC ID is writable by software. But, those
assumptions are unavoidable due to the lack of any way to enumerate the
"true" width.
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Message-Id: <20220211000851.185799-1-suravee.suthikulpanit(a)amd.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bb2fb78523ce..7eb2df5417fb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -226,7 +226,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1afde44b1252..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -870,17 +870,12 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -569,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a204f7895878363ca8211f50ec610408c8c70aa Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Date: Thu, 10 Feb 2022 18:08:51 -0600
Subject: [PATCH] KVM: SVM: Allow AVIC support on system w/ physical APIC ID >
255
Expand KVM's mask for the AVIC host physical ID to the full 12 bits defined
by the architecture. The number of bits consumed by hardware is model
specific, e.g. early CPUs ignored bits 11:8, but there is no way for KVM
to enumerate the "true" size. So, KVM must allow using all bits, else it
risks rejecting completely legal x2APIC IDs on newer CPUs.
This means KVM relies on hardware to not assign x2APIC IDs that exceed the
"true" width of the field, but presumably hardware is smart enough to tie
the width to the max x2APIC ID. KVM also relies on hardware to support at
least 8 bits, as the legacy xAPIC ID is writable by software. But, those
assumptions are unavoidable due to the lack of any way to enumerate the
"true" width.
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Message-Id: <20220211000851.185799-1-suravee.suthikulpanit(a)amd.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bb2fb78523ce..7eb2df5417fb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -226,7 +226,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1afde44b1252..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -870,17 +870,12 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -569,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4a204f7895878363ca8211f50ec610408c8c70aa Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Date: Thu, 10 Feb 2022 18:08:51 -0600
Subject: [PATCH] KVM: SVM: Allow AVIC support on system w/ physical APIC ID >
255
Expand KVM's mask for the AVIC host physical ID to the full 12 bits defined
by the architecture. The number of bits consumed by hardware is model
specific, e.g. early CPUs ignored bits 11:8, but there is no way for KVM
to enumerate the "true" size. So, KVM must allow using all bits, else it
risks rejecting completely legal x2APIC IDs on newer CPUs.
This means KVM relies on hardware to not assign x2APIC IDs that exceed the
"true" width of the field, but presumably hardware is smart enough to tie
the width to the max x2APIC ID. KVM also relies on hardware to support at
least 8 bits, as the legacy xAPIC ID is writable by software. But, those
assumptions are unavoidable due to the lack of any way to enumerate the
"true" width.
Cc: stable(a)vger.kernel.org
Cc: Maxim Levitsky <mlevitsk(a)redhat.com>
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support")
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit(a)amd.com>
Message-Id: <20220211000851.185799-1-suravee.suthikulpanit(a)amd.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bb2fb78523ce..7eb2df5417fb 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -226,7 +226,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
-#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 1afde44b1252..b37b353ec086 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -870,17 +870,12 @@ avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
lockdep_assert_preemption_disabled();
- /*
- * Since the host physical APIC id is 8 bits,
- * we can support host APIC ID upto 255.
- */
- if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
+ if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e45b5645d5e0..e37bb3508cfa 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -569,6 +569,17 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
+#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
+#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
+
+#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK GENMASK_ULL(11, 0)
+#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12)
+#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62)
+#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63)
+
+#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
+
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 24a644ebbfd3b13cda702f98907f9dd123e34bf9 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Thu, 10 Feb 2022 12:36:42 +0200
Subject: [PATCH] drm/i915/opregion: check port number bounds for SWSCI display
power state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The mapping from enum port to whatever port numbering scheme is used by
the SWSCI Display Power State Notification is odd, and the memory of it
has faded. In any case, the parameter only has space for ports numbered
[0..4], and UBSAN reports bit shift beyond it when the platform has port
F or more.
Since the SWSCI functionality is supposed to be obsolete for new
platforms (i.e. ones that might have port F or more), just bail out
early if the mapped and mangled port number is beyond what the Display
Power State Notification can support.
Fixes: 9c4b0a683193 ("drm/i915: add opregion function to notify bios of encoder enable/disable")
Cc: <stable(a)vger.kernel.org> # v3.13+
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: Lucas De Marchi <lucas.demarchi(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4800
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/cc363f42d6b5a5932b6d218fefcc8…
diff --git a/drivers/gpu/drm/i915/display/intel_opregion.c b/drivers/gpu/drm/i915/display/intel_opregion.c
index af9d30f56cc1..ad1afe9df6c3 100644
--- a/drivers/gpu/drm/i915/display/intel_opregion.c
+++ b/drivers/gpu/drm/i915/display/intel_opregion.c
@@ -363,6 +363,21 @@ int intel_opregion_notify_encoder(struct intel_encoder *intel_encoder,
port++;
}
+ /*
+ * The port numbering and mapping here is bizarre. The now-obsolete
+ * swsci spec supports ports numbered [0..4]. Port E is handled as a
+ * special case, but port F and beyond are not. The functionality is
+ * supposed to be obsolete for new platforms. Just bail out if the port
+ * number is out of bounds after mapping.
+ */
+ if (port > 4) {
+ drm_dbg_kms(&dev_priv->drm,
+ "[ENCODER:%d:%s] port %c (index %u) out of bounds for display power state notification\n",
+ intel_encoder->base.base.id, intel_encoder->base.name,
+ port_name(intel_encoder->port), port);
+ return -EINVAL;
+ }
+
if (!enable)
parm |= 4 << 8;
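The pattern in the fix -- validate the mapped index before using it as
a shift amount or field selector -- is easy to demonstrate in
isolation. A hypothetical sketch (the 4-bits-per-port packing below is
invented for illustration; the real SWSCI parameter layout is not shown
in the patch):

#include <stdint.h>
#include <stdio.h>

#define MAX_SWSCI_PORT 4 /* the notification has slots for ports 0..4 only */

/* Hypothetical packing: a 4-bit state per port inside a u32. Ports
 * above 4 would shift state into bits the field does not reserve
 * (and, with the real layout's wider shifts, past the end of the
 * parameter, which is what UBSAN flagged), so bail out first. */
static int encode_port_state(unsigned int port, unsigned int state,
                             uint32_t *parm)
{
        if (port > MAX_SWSCI_PORT)
                return -1; /* mirrors the -EINVAL bail-out in the fix */
        *parm |= (state & 0xfu) << (port * 4);
        return 0;
}

int main(void)
{
        uint32_t parm = 0;
        printf("port 3: %s\n", encode_port_state(3, 1, &parm) ? "rejected" : "ok");
        printf("port 5: %s\n", encode_port_state(5, 1, &parm) ? "rejected" : "ok");
        printf("parm = %#x\n", parm);
        return 0;
}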
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 24a644ebbfd3b13cda702f98907f9dd123e34bf9 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Thu, 10 Feb 2022 12:36:42 +0200
Subject: [PATCH] drm/i915/opregion: check port number bounds for SWSCI display
power state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The mapping from enum port to whatever port numbering scheme is used by
the SWSCI Display Power State Notification is odd, and the memory of it
has faded. In any case, the parameter only has space for ports numbered
[0..4], and UBSAN reports bit shift beyond it when the platform has port
F or more.
Since the SWSCI functionality is supposed to be obsolete for new
platforms (i.e. ones that might have port F or more), just bail out
early if the mapped and mangled port number is beyond what the Display
Power State Notification can support.
Fixes: 9c4b0a683193 ("drm/i915: add opregion function to notify bios of encoder enable/disable")
Cc: <stable(a)vger.kernel.org> # v3.13+
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: Lucas De Marchi <lucas.demarchi(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4800
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/cc363f42d6b5a5932b6d218fefcc8…
diff --git a/drivers/gpu/drm/i915/display/intel_opregion.c b/drivers/gpu/drm/i915/display/intel_opregion.c
index af9d30f56cc1..ad1afe9df6c3 100644
--- a/drivers/gpu/drm/i915/display/intel_opregion.c
+++ b/drivers/gpu/drm/i915/display/intel_opregion.c
@@ -363,6 +363,21 @@ int intel_opregion_notify_encoder(struct intel_encoder *intel_encoder,
port++;
}
+ /*
+ * The port numbering and mapping here is bizarre. The now-obsolete
+ * swsci spec supports ports numbered [0..4]. Port E is handled as a
+ * special case, but port F and beyond are not. The functionality is
+ * supposed to be obsolete for new platforms. Just bail out if the port
+ * number is out of bounds after mapping.
+ */
+ if (port > 4) {
+ drm_dbg_kms(&dev_priv->drm,
+ "[ENCODER:%d:%s] port %c (index %u) out of bounds for display power state notification\n",
+ intel_encoder->base.base.id, intel_encoder->base.name,
+ port_name(intel_encoder->port), port);
+ return -EINVAL;
+ }
+
if (!enable)
parm |= 4 << 8;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 24a644ebbfd3b13cda702f98907f9dd123e34bf9 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Thu, 10 Feb 2022 12:36:42 +0200
Subject: [PATCH] drm/i915/opregion: check port number bounds for SWSCI display
power state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The mapping from enum port to whatever port numbering scheme is used by
the SWSCI Display Power State Notification is odd, and the memory of it
has faded. In any case, the parameter only has space for ports numbered
[0..4], and UBSAN reports bit shift beyond it when the platform has port
F or more.
Since the SWSCI functionality is supposed to be obsolete for new
platforms (i.e. ones that might have port F or more), just bail out
early if the mapped and mangled port number is beyond what the Display
Power State Notification can support.
Fixes: 9c4b0a683193 ("drm/i915: add opregion function to notify bios of encoder enable/disable")
Cc: <stable(a)vger.kernel.org> # v3.13+
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: Lucas De Marchi <lucas.demarchi(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4800
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/cc363f42d6b5a5932b6d218fefcc8…
diff --git a/drivers/gpu/drm/i915/display/intel_opregion.c b/drivers/gpu/drm/i915/display/intel_opregion.c
index af9d30f56cc1..ad1afe9df6c3 100644
--- a/drivers/gpu/drm/i915/display/intel_opregion.c
+++ b/drivers/gpu/drm/i915/display/intel_opregion.c
@@ -363,6 +363,21 @@ int intel_opregion_notify_encoder(struct intel_encoder *intel_encoder,
port++;
}
+ /*
+ * The port numbering and mapping here is bizarre. The now-obsolete
+ * swsci spec supports ports numbered [0..4]. Port E is handled as a
+ * special case, but port F and beyond are not. The functionality is
+ * supposed to be obsolete for new platforms. Just bail out if the port
+ * number is out of bounds after mapping.
+ */
+ if (port > 4) {
+ drm_dbg_kms(&dev_priv->drm,
+ "[ENCODER:%d:%s] port %c (index %u) out of bounds for display power state notification\n",
+ intel_encoder->base.base.id, intel_encoder->base.name,
+ port_name(intel_encoder->port), port);
+ return -EINVAL;
+ }
+
if (!enable)
parm |= 4 << 8;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 24a644ebbfd3b13cda702f98907f9dd123e34bf9 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Thu, 10 Feb 2022 12:36:42 +0200
Subject: [PATCH] drm/i915/opregion: check port number bounds for SWSCI display
power state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The mapping from enum port to whatever port numbering scheme is used by
the SWSCI Display Power State Notification is odd, and the memory of it
has faded. In any case, the parameter only has space for ports numbered
[0..4], and UBSAN reports bit shift beyond it when the platform has port
F or more.
Since the SWSCI functionality is supposed to be obsolete for new
platforms (i.e. ones that might have port F or more), just bail out
early if the mapped and mangled port number is beyond what the Display
Power State Notification can support.
Fixes: 9c4b0a683193 ("drm/i915: add opregion function to notify bios of encoder enable/disable")
Cc: <stable(a)vger.kernel.org> # v3.13+
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Cc: Lucas De Marchi <lucas.demarchi(a)intel.com>
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4800
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/cc363f42d6b5a5932b6d218fefcc8…
diff --git a/drivers/gpu/drm/i915/display/intel_opregion.c b/drivers/gpu/drm/i915/display/intel_opregion.c
index af9d30f56cc1..ad1afe9df6c3 100644
--- a/drivers/gpu/drm/i915/display/intel_opregion.c
+++ b/drivers/gpu/drm/i915/display/intel_opregion.c
@@ -363,6 +363,21 @@ int intel_opregion_notify_encoder(struct intel_encoder *intel_encoder,
port++;
}
+ /*
+ * The port numbering and mapping here is bizarre. The now-obsolete
+ * swsci spec supports ports numbered [0..4]. Port E is handled as a
+ * special case, but port F and beyond are not. The functionality is
+ * supposed to be obsolete for new platforms. Just bail out if the port
+ * number is out of bounds after mapping.
+ */
+ if (port > 4) {
+ drm_dbg_kms(&dev_priv->drm,
+ "[ENCODER:%d:%s] port %c (index %u) out of bounds for display power state notification\n",
+ intel_encoder->base.base.id, intel_encoder->base.name,
+ port_name(intel_encoder->port), port);
+ return -EINVAL;
+ }
+
if (!enable)
parm |= 4 << 8;
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 32370191c0851da069d242f581cbe2fdb80040cb Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello(a)amd.com>
Date: Wed, 23 Feb 2022 11:52:37 -0600
Subject: [PATCH] platform/x86: amd-pmc: Set QOS during suspend on CZN w/ timer
wakeup
commit 59348401ebed ("platform/x86: amd-pmc: Add special handling for
timer based S0i3 wakeup") adds support for using another platform timer
in lieu of the RTC, which doesn't work properly on some systems. This path
was validated and worked well before submission. During the 5.16-rc1 merge
window, other patches were merged that caused this to stop working properly.
When this feature was used with 5.16-rc1 or later, some OEM laptops with the
matching firmware requirements from that commit would shut down instead of
programming a timer-based wakeup.
This was bisected to commit 8d89835b0467 ("PM: suspend: Do not pause
cpuidle in the suspend-to-idle path"). This wasn't supposed to cause any
negative impact, and it also tested well on both Intel and ARM platforms.
However, it changed the semantics of when CPUs are allowed to be in the
deepest state. For the AMD systems in question, it appears this causes a
firmware crash for timer-based wakeup.
It's hypothesized to be caused by the `amd-pmc` driver sending `OS_HINT`
and all the CPUs going into a deep state while the timer is still being
programmed. It's likely a firmware bug, but to avoid it, don't allow
CPUs to enter the deepest idle state while the CZN timer wakeup path is
in use.
If it's later discovered that this also occurs for "regular" suspends
without a timer, or on other silicon, this may be expanded to run in the
suspend path for more scenarios.
Cc: stable(a)vger.kernel.org # 5.16+
Suggested-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Link: https://lore.kernel.org/linux-acpi/BL1PR12MB51570F5BD05980A0DCA1F3F4E23A9@B…
Fixes: 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path")
Fixes: 23f62d7ab25b ("PM: sleep: Pause cpuidle later and resume it earlier during system transitions")
Fixes: 59348401ebed ("platform/x86: amd-pmc: Add special handling for timer based S0i3 wakeup")
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Link: https://lore.kernel.org/r/20220223175237.6209-1-mario.limonciello@amd.com
Reviewed-by: Hans de Goede <hdegoede(a)redhat.com>
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c
index 69fdbb0d9f45..425a86108f75 100644
--- a/drivers/platform/x86/amd-pmc.c
+++ b/drivers/platform/x86/amd-pmc.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
+#include <linux/pm_qos.h>
#include <linux/rtc.h>
#include <linux/suspend.h>
#include <linux/seq_file.h>
@@ -95,6 +96,9 @@
#define PMC_MSG_DELAY_MIN_US 50
#define RESPONSE_REGISTER_LOOP_MAX 20000
+/* QoS request for letting CPUs in idle states, but not the deepest */
+#define AMD_PMC_MAX_IDLE_STATE_LATENCY 3
+
#define SOC_SUBSYSTEM_IP_MAX 12
#define DELAY_MIN_US 2000
#define DELAY_MAX_US 3000
@@ -149,6 +153,7 @@ struct amd_pmc_dev {
struct device *dev;
struct pci_dev *rdev;
struct mutex lock; /* generic mutex lock */
+ struct pm_qos_request amd_pmc_pm_qos_req;
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *dbgfs_dir;
#endif /* CONFIG_DEBUG_FS */
@@ -603,6 +608,14 @@ static int amd_pmc_verify_czn_rtc(struct amd_pmc_dev *pdev, u32 *arg)
rc = rtc_alarm_irq_enable(rtc_device, 0);
dev_dbg(pdev->dev, "wakeup timer programmed for %lld seconds\n", duration);
+ /*
+ * Prevent CPUs from getting into deep idle states while sending OS_HINT
+ * which is otherwise generally safe to send when at least one of the CPUs
+ * is not in deep idle states.
+ */
+ cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req, AMD_PMC_MAX_IDLE_STATE_LATENCY);
+ wake_up_all_idle_cpus();
+
return rc;
}
@@ -620,24 +633,31 @@ static int __maybe_unused amd_pmc_suspend(struct device *dev)
/* Activate CZN specific RTC functionality */
if (pdev->cpu_id == AMD_CPU_ID_CZN) {
rc = amd_pmc_verify_czn_rtc(pdev, &arg);
- if (rc < 0)
- return rc;
+ if (rc)
+ goto fail;
}
/* Dump the IdleMask before we send hint to SMU */
amd_pmc_idlemask_read(pdev, dev, NULL);
msg = amd_pmc_get_os_hint(pdev);
rc = amd_pmc_send_cmd(pdev, arg, NULL, msg, 0);
- if (rc)
+ if (rc) {
dev_err(pdev->dev, "suspend failed\n");
+ goto fail;
+ }
if (enable_stb)
rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF);
- if (rc) {
+ if (rc) {
dev_err(pdev->dev, "error writing to STB\n");
- return rc;
+ goto fail;
}
+ return 0;
+fail:
+ if (pdev->cpu_id == AMD_CPU_ID_CZN)
+ cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req,
+ PM_QOS_DEFAULT_VALUE);
return rc;
}
@@ -661,12 +681,15 @@ static int __maybe_unused amd_pmc_resume(struct device *dev)
/* Write data incremented by 1 to distinguish in stb_read */
if (enable_stb)
rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF + 1);
- if (rc) {
+ if (rc)
dev_err(pdev->dev, "error writing to STB\n");
- return rc;
- }
- return 0;
+ /* Restore the QoS request back to defaults if it was set */
+ if (pdev->cpu_id == AMD_CPU_ID_CZN)
+ cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req,
+ PM_QOS_DEFAULT_VALUE);
+
+ return rc;
}
static const struct dev_pm_ops amd_pmc_pm_ops = {
@@ -838,6 +861,7 @@ static int amd_pmc_probe(struct platform_device *pdev)
amd_pmc_get_smu_version(dev);
platform_set_drvdata(pdev, dev);
amd_pmc_dbgfs_register(dev);
+ cpu_latency_qos_add_request(&dev->amd_pmc_pm_qos_req, PM_QOS_DEFAULT_VALUE);
return 0;
err_pci_dev_put:
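The fix rests on the CPU latency QoS API: a request registered at probe
time with PM_QOS_DEFAULT_VALUE is inert until it is tightened, and
updating it back restores normal idle behavior. A condensed sketch of
that add/tighten/restore lifecycle, assuming a simplified driver
context (the demo_* names are illustrative, not amd-pmc's):

#include <linux/cpu.h>
#include <linux/pm_qos.h>

struct demo_dev {
	struct pm_qos_request qos_req;
};

/* Probe: register an inactive request (default value = no constraint). */
static void demo_qos_init(struct demo_dev *d)
{
	cpu_latency_qos_add_request(&d->qos_req, PM_QOS_DEFAULT_VALUE);
}

/*
 * Suspend path: cap CPU exit latency at 3 us so the deepest idle
 * states are off-limits, then kick idle CPUs so the constraint is
 * observed before OS_HINT is sent.
 */
static void demo_qos_block_deep_idle(struct demo_dev *d)
{
	cpu_latency_qos_update_request(&d->qos_req, 3);
	wake_up_all_idle_cpus();
}

/* Resume, or any failed suspend: drop the constraint again. */
static void demo_qos_restore(struct demo_dev *d)
{
	cpu_latency_qos_update_request(&d->qos_req, PM_QOS_DEFAULT_VALUE);
}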
The patch below does not apply to the 5.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 32370191c0851da069d242f581cbe2fdb80040cb Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello(a)amd.com>
Date: Wed, 23 Feb 2022 11:52:37 -0600
Subject: [PATCH] platform/x86: amd-pmc: Set QOS during suspend on CZN w/ timer
wakeup
commit 59348401ebed ("platform/x86: amd-pmc: Add special handling for
timer based S0i3 wakeup") adds support for using another platform timer
in lieu of the RTC, which doesn't work properly on some systems. This
path was validated and worked well before submission. During the
5.16-rc1 merge window other patches were merged that caused this to stop
working properly.
When this feature was used with 5.16-rc1 or later, some OEM laptops with
the matching firmware requirements from that commit would shut down
instead of programming a timer-based wakeup.
This was bisected to commit 8d89835b0467 ("PM: suspend: Do not pause
cpuidle in the suspend-to-idle path"), which wasn't supposed to cause
any negative impact and also tested well on both Intel and ARM
platforms. However, it changed the semantics of when CPUs are allowed to
be in the deepest idle state. For the AMD systems in question, this
appears to cause a firmware crash on timer-based wakeup.
The hypothesis is that the `amd-pmc` driver sends `OS_HINT` and all the
CPUs then enter a deep state while the timer is still being programmed.
It's likely a firmware bug, but to avoid it, don't allow putting CPUs
into the deepest state while using the CZN timer wakeup path.
If it's later discovered that this also occurs on "regular" suspends
without a timer, or on other silicon, this may be expanded to run in the
suspend path for more scenarios.
Cc: stable(a)vger.kernel.org # 5.16+
Suggested-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Link: https://lore.kernel.org/linux-acpi/BL1PR12MB51570F5BD05980A0DCA1F3F4E23A9@B…
Fixes: 8d89835b0467 ("PM: suspend: Do not pause cpuidle in the suspend-to-idle path")
Fixes: 23f62d7ab25b ("PM: sleep: Pause cpuidle later and resume it earlier during system transitions")
Fixes: 59348401ebed ("platform/x86: amd-pmc: Add special handling for timer based S0i3 wakeup")
Reviewed-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Link: https://lore.kernel.org/r/20220223175237.6209-1-mario.limonciello@amd.com
Reviewed-by: Hans de Goede <hdegoede(a)redhat.com>
Signed-off-by: Hans de Goede <hdegoede(a)redhat.com>
diff --git a/drivers/platform/x86/amd-pmc.c b/drivers/platform/x86/amd-pmc.c
index 69fdbb0d9f45..425a86108f75 100644
--- a/drivers/platform/x86/amd-pmc.c
+++ b/drivers/platform/x86/amd-pmc.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
+#include <linux/pm_qos.h>
#include <linux/rtc.h>
#include <linux/suspend.h>
#include <linux/seq_file.h>
@@ -95,6 +96,9 @@
#define PMC_MSG_DELAY_MIN_US 50
#define RESPONSE_REGISTER_LOOP_MAX 20000
+/* QoS request for letting CPUs in idle states, but not the deepest */
+#define AMD_PMC_MAX_IDLE_STATE_LATENCY 3
+
#define SOC_SUBSYSTEM_IP_MAX 12
#define DELAY_MIN_US 2000
#define DELAY_MAX_US 3000
@@ -149,6 +153,7 @@ struct amd_pmc_dev {
struct device *dev;
struct pci_dev *rdev;
struct mutex lock; /* generic mutex lock */
+ struct pm_qos_request amd_pmc_pm_qos_req;
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *dbgfs_dir;
#endif /* CONFIG_DEBUG_FS */
@@ -603,6 +608,14 @@ static int amd_pmc_verify_czn_rtc(struct amd_pmc_dev *pdev, u32 *arg)
rc = rtc_alarm_irq_enable(rtc_device, 0);
dev_dbg(pdev->dev, "wakeup timer programmed for %lld seconds\n", duration);
+ /*
+ * Prevent CPUs from getting into deep idle states while sending OS_HINT
+ * which is otherwise generally safe to send when at least one of the CPUs
+ * is not in deep idle states.
+ */
+ cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req, AMD_PMC_MAX_IDLE_STATE_LATENCY);
+ wake_up_all_idle_cpus();
+
return rc;
}
@@ -620,24 +633,31 @@ static int __maybe_unused amd_pmc_suspend(struct device *dev)
/* Activate CZN specific RTC functionality */
if (pdev->cpu_id == AMD_CPU_ID_CZN) {
rc = amd_pmc_verify_czn_rtc(pdev, &arg);
- if (rc < 0)
- return rc;
+ if (rc)
+ goto fail;
}
/* Dump the IdleMask before we send hint to SMU */
amd_pmc_idlemask_read(pdev, dev, NULL);
msg = amd_pmc_get_os_hint(pdev);
rc = amd_pmc_send_cmd(pdev, arg, NULL, msg, 0);
- if (rc)
+ if (rc) {
dev_err(pdev->dev, "suspend failed\n");
+ goto fail;
+ }
if (enable_stb)
rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF);
- if (rc) {
+ if (rc) {
dev_err(pdev->dev, "error writing to STB\n");
- return rc;
+ goto fail;
}
+ return 0;
+fail:
+ if (pdev->cpu_id == AMD_CPU_ID_CZN)
+ cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req,
+ PM_QOS_DEFAULT_VALUE);
return rc;
}
@@ -661,12 +681,15 @@ static int __maybe_unused amd_pmc_resume(struct device *dev)
/* Write data incremented by 1 to distinguish in stb_read */
if (enable_stb)
rc = amd_pmc_write_stb(pdev, AMD_PMC_STB_PREDEF + 1);
- if (rc) {
+ if (rc)
dev_err(pdev->dev, "error writing to STB\n");
- return rc;
- }
- return 0;
+ /* Restore the QoS request back to defaults if it was set */
+ if (pdev->cpu_id == AMD_CPU_ID_CZN)
+ cpu_latency_qos_update_request(&pdev->amd_pmc_pm_qos_req,
+ PM_QOS_DEFAULT_VALUE);
+
+ return rc;
}
static const struct dev_pm_ops amd_pmc_pm_ops = {
@@ -838,6 +861,7 @@ static int amd_pmc_probe(struct platform_device *pdev)
amd_pmc_get_smu_version(dev);
platform_set_drvdata(pdev, dev);
amd_pmc_dbgfs_register(dev);
+ cpu_latency_qos_add_request(&dev->amd_pmc_pm_qos_req, PM_QOS_DEFAULT_VALUE);
return 0;
err_pci_dev_put:
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b50255c83b914defd61a57fbc81d452334b63f4c Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan(a)marcan.st>
Date: Tue, 1 Feb 2022 01:07:10 +0900
Subject: [PATCH] brcmfmac: pcie: Fix crashes due to early IRQs
The driver was enabling IRQs before the message processing was
initialized. This could cause IRQs to come in too early and crash the
driver. Instead, move the IRQ enable and hostready to a bus preinit
function, at which point everything is properly initialized.
Fixes: 9e37f045d5e7 ("brcmfmac: Adding PCIe bus layer support.")
Reviewed-by: Linus Walleij <linus.walleij(a)linaro.org>
Reviewed-by: Arend van Spriel <arend.vanspriel(a)broadcom.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Hector Martin <marcan(a)marcan.st>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Signed-off-by: Kalle Valo <kvalo(a)kernel.org>
Link: https://lore.kernel.org/r/20220131160713.245637-7-marcan@marcan.st
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 3f3ca7612bcd..55f0111283c9 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -1315,6 +1315,18 @@ static void brcmf_pcie_down(struct device *dev)
{
}
+static int brcmf_pcie_preinit(struct device *dev)
+{
+ struct brcmf_bus *bus_if = dev_get_drvdata(dev);
+ struct brcmf_pciedev *buspub = bus_if->bus_priv.pcie;
+
+ brcmf_dbg(PCIE, "Enter\n");
+
+ brcmf_pcie_intr_enable(buspub->devinfo);
+ brcmf_pcie_hostready(buspub->devinfo);
+
+ return 0;
+}
static int brcmf_pcie_tx(struct device *dev, struct sk_buff *skb)
{
@@ -1423,6 +1435,7 @@ static int brcmf_pcie_reset(struct device *dev)
}
static const struct brcmf_bus_ops brcmf_pcie_bus_ops = {
+ .preinit = brcmf_pcie_preinit,
.txdata = brcmf_pcie_tx,
.stop = brcmf_pcie_down,
.txctl = brcmf_pcie_tx_ctlpkt,
@@ -1795,9 +1808,6 @@ static void brcmf_pcie_setup(struct device *dev, int ret,
init_waitqueue_head(&devinfo->mbdata_resp_wait);
- brcmf_pcie_intr_enable(devinfo);
- brcmf_pcie_hostready(devinfo);
-
ret = brcmf_attach(&devinfo->pdev->dev);
if (ret)
goto fail;
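The bug is purely one of ordering: interrupts were unmasked while the
message-processing state they feed was still uninitialized. A small
userspace sketch of why moving the enable into a post-initialization
preinit hook closes the window (all names are illustrative, not
brcmfmac's):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative device state, not brcmfmac's structures. */
struct demo_dev {
	bool msgbuf_ready;	/* message processing initialized? */
	bool irq_enabled;
};

/* Simulated IRQ handler: touching message state before it exists is
 * exactly the crash the patch avoids. */
static void demo_irq(struct demo_dev *d)
{
	if (!d->msgbuf_ready) {
		fprintf(stderr, "early IRQ: would crash\n");
		return;
	}
	puts("IRQ handled");
}

/* preinit hook: runs only after attach has finished initializing,
 * so this is the earliest safe point to enable interrupts. */
static int demo_preinit(struct demo_dev *d)
{
	d->irq_enabled = true;
	return 0;
}

int main(void)
{
	struct demo_dev dev = { 0 };

	dev.msgbuf_ready = true;	/* attach(): set up message rings */
	demo_preinit(&dev);		/* then, and only then, enable IRQs */
	if (dev.irq_enabled)
		demo_irq(&dev);
	return 0;
}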
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b50255c83b914defd61a57fbc81d452334b63f4c Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan(a)marcan.st>
Date: Tue, 1 Feb 2022 01:07:10 +0900
Subject: [PATCH] brcmfmac: pcie: Fix crashes due to early IRQs
The driver was enabling IRQs before the message processing was
initialized. This could cause IRQs to come in too early and crash the
driver. Instead, move the IRQ enable and hostready to a bus preinit
function, at which point everything is properly initialized.
Fixes: 9e37f045d5e7 ("brcmfmac: Adding PCIe bus layer support.")
Reviewed-by: Linus Walleij <linus.walleij(a)linaro.org>
Reviewed-by: Arend van Spriel <arend.vanspriel(a)broadcom.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Hector Martin <marcan(a)marcan.st>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Signed-off-by: Kalle Valo <kvalo(a)kernel.org>
Link: https://lore.kernel.org/r/20220131160713.245637-7-marcan@marcan.st
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 3f3ca7612bcd..55f0111283c9 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -1315,6 +1315,18 @@ static void brcmf_pcie_down(struct device *dev)
{
}
+static int brcmf_pcie_preinit(struct device *dev)
+{
+ struct brcmf_bus *bus_if = dev_get_drvdata(dev);
+ struct brcmf_pciedev *buspub = bus_if->bus_priv.pcie;
+
+ brcmf_dbg(PCIE, "Enter\n");
+
+ brcmf_pcie_intr_enable(buspub->devinfo);
+ brcmf_pcie_hostready(buspub->devinfo);
+
+ return 0;
+}
static int brcmf_pcie_tx(struct device *dev, struct sk_buff *skb)
{
@@ -1423,6 +1435,7 @@ static int brcmf_pcie_reset(struct device *dev)
}
static const struct brcmf_bus_ops brcmf_pcie_bus_ops = {
+ .preinit = brcmf_pcie_preinit,
.txdata = brcmf_pcie_tx,
.stop = brcmf_pcie_down,
.txctl = brcmf_pcie_tx_ctlpkt,
@@ -1795,9 +1808,6 @@ static void brcmf_pcie_setup(struct device *dev, int ret,
init_waitqueue_head(&devinfo->mbdata_resp_wait);
- brcmf_pcie_intr_enable(devinfo);
- brcmf_pcie_hostready(devinfo);
-
ret = brcmf_attach(&devinfo->pdev->dev);
if (ret)
goto fail;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b50255c83b914defd61a57fbc81d452334b63f4c Mon Sep 17 00:00:00 2001
From: Hector Martin <marcan(a)marcan.st>
Date: Tue, 1 Feb 2022 01:07:10 +0900
Subject: [PATCH] brcmfmac: pcie: Fix crashes due to early IRQs
The driver was enabling IRQs before the message processing was
initialized. This could cause IRQs to come in too early and crash the
driver. Instead, move the IRQ enable and hostready to a bus preinit
function, at which point everything is properly initialized.
Fixes: 9e37f045d5e7 ("brcmfmac: Adding PCIe bus layer support.")
Reviewed-by: Linus Walleij <linus.walleij(a)linaro.org>
Reviewed-by: Arend van Spriel <arend.vanspriel(a)broadcom.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Hector Martin <marcan(a)marcan.st>
Reviewed-by: Andy Shevchenko <andy.shevchenko(a)gmail.com>
Signed-off-by: Kalle Valo <kvalo(a)kernel.org>
Link: https://lore.kernel.org/r/20220131160713.245637-7-marcan@marcan.st
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
index 3f3ca7612bcd..55f0111283c9 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/pcie.c
@@ -1315,6 +1315,18 @@ static void brcmf_pcie_down(struct device *dev)
{
}
+static int brcmf_pcie_preinit(struct device *dev)
+{
+ struct brcmf_bus *bus_if = dev_get_drvdata(dev);
+ struct brcmf_pciedev *buspub = bus_if->bus_priv.pcie;
+
+ brcmf_dbg(PCIE, "Enter\n");
+
+ brcmf_pcie_intr_enable(buspub->devinfo);
+ brcmf_pcie_hostready(buspub->devinfo);
+
+ return 0;
+}
static int brcmf_pcie_tx(struct device *dev, struct sk_buff *skb)
{
@@ -1423,6 +1435,7 @@ static int brcmf_pcie_reset(struct device *dev)
}
static const struct brcmf_bus_ops brcmf_pcie_bus_ops = {
+ .preinit = brcmf_pcie_preinit,
.txdata = brcmf_pcie_tx,
.stop = brcmf_pcie_down,
.txctl = brcmf_pcie_tx_ctlpkt,
@@ -1795,9 +1808,6 @@ static void brcmf_pcie_setup(struct device *dev, int ret,
init_waitqueue_head(&devinfo->mbdata_resp_wait);
- brcmf_pcie_intr_enable(devinfo);
- brcmf_pcie_hostready(devinfo);
-
ret = brcmf_attach(&devinfo->pdev->dev);
if (ret)
goto fail;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cd9f7f7ac5932129fe81b4c7559cfcb226ec7c5c Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann(a)suse.de>
Date: Tue, 1 Feb 2022 12:53:05 +0100
Subject: [PATCH] drm/fb-helper: Mark screen buffers in system memory with
FBINFO_VIRTFB
Mark screen buffers in system memory with FBINFO_VIRTFB. Otherwise, fbdev
deferred I/O marks mmap'ed areas of system memory with VM_IO. (There's an
inverse relationship between the two flags.)
For shadow buffers, also set the FBINFO_READS_FAST hint.
v3:
* change FB_ to FBINFO_ in commit description
v2:
* updated commit description (Daniel)
* added Fixes tag
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Fixes: d536540f304c ("drm/fb-helper: Add generic fbdev emulation .fb_probe function")
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v4.19+
Link: https://patchwork.freedesktop.org/patch/msgid/20220201115305.9333-1-tzimmer…
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 9727a59d35fd..805c5a666490 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -2340,6 +2340,7 @@ static int drm_fb_helper_generic_probe(struct drm_fb_helper *fb_helper,
fbi->fbops = &drm_fbdev_fb_ops;
fbi->screen_size = sizes->surface_height * fb->pitches[0];
fbi->fix.smem_len = fbi->screen_size;
+ fbi->flags = FBINFO_DEFAULT;
drm_fb_helper_fill_info(fbi, fb_helper, sizes);
@@ -2347,19 +2348,21 @@ static int drm_fb_helper_generic_probe(struct drm_fb_helper *fb_helper,
fbi->screen_buffer = vzalloc(fbi->screen_size);
if (!fbi->screen_buffer)
return -ENOMEM;
+ fbi->flags |= FBINFO_VIRTFB | FBINFO_READS_FAST;
fbi->fbdefio = &drm_fbdev_defio;
-
fb_deferred_io_init(fbi);
} else {
/* buffer is mapped for HW framebuffer */
ret = drm_client_buffer_vmap(fb_helper->buffer, &map);
if (ret)
return ret;
- if (map.is_iomem)
+ if (map.is_iomem) {
fbi->screen_base = map.vaddr_iomem;
- else
+ } else {
fbi->screen_buffer = map.vaddr;
+ fbi->flags |= FBINFO_VIRTFB;
+ }
/*
* Shamelessly leak the physical address to user-space. As
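The flag choice matters because fbdev deferred I/O decides how to map
the buffer based on FBINFO_VIRTFB: system-memory buffers must not end
up in VM_IO mappings. A condensed sketch of the selection logic the
patch implements, assuming a simplified probe helper (demo_* is
illustrative, not the drm_fb_helper code):

#include <linux/fb.h>
#include <linux/types.h>

/*
 * Pick fbdev info flags for a screen buffer. A vzalloc'ed shadow
 * buffer lives in system memory and is cheap to read back, so it
 * gets FBINFO_VIRTFB plus the FBINFO_READS_FAST hint; a vmapped
 * client buffer gets FBINFO_VIRTFB only when it is not I/O memory.
 */
static void demo_set_fb_flags(struct fb_info *fbi, bool shadow, bool iomem)
{
	fbi->flags = FBINFO_DEFAULT;

	if (shadow)
		fbi->flags |= FBINFO_VIRTFB | FBINFO_READS_FAST;
	else if (!iomem)
		fbi->flags |= FBINFO_VIRTFB;
}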
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cd9f7f7ac5932129fe81b4c7559cfcb226ec7c5c Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann(a)suse.de>
Date: Tue, 1 Feb 2022 12:53:05 +0100
Subject: [PATCH] drm/fb-helper: Mark screen buffers in system memory with
FBINFO_VIRTFB
Mark screen buffers in system memory with FBINFO_VIRTFB. Otherwise, fbdev
deferred I/O marks mmap'ed areas of system memory with VM_IO. (There's an
inverse relationship between the two flags.)
For shadow buffers, also set the FBINFO_READS_FAST hint.
v3:
* change FB_ to FBINFO_ in commit description
v2:
* updated commit description (Daniel)
* added Fixes tag
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Fixes: d536540f304c ("drm/fb-helper: Add generic fbdev emulation .fb_probe function")
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v4.19+
Link: https://patchwork.freedesktop.org/patch/msgid/20220201115305.9333-1-tzimmer…
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 9727a59d35fd..805c5a666490 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -2340,6 +2340,7 @@ static int drm_fb_helper_generic_probe(struct drm_fb_helper *fb_helper,
fbi->fbops = &drm_fbdev_fb_ops;
fbi->screen_size = sizes->surface_height * fb->pitches[0];
fbi->fix.smem_len = fbi->screen_size;
+ fbi->flags = FBINFO_DEFAULT;
drm_fb_helper_fill_info(fbi, fb_helper, sizes);
@@ -2347,19 +2348,21 @@ static int drm_fb_helper_generic_probe(struct drm_fb_helper *fb_helper,
fbi->screen_buffer = vzalloc(fbi->screen_size);
if (!fbi->screen_buffer)
return -ENOMEM;
+ fbi->flags |= FBINFO_VIRTFB | FBINFO_READS_FAST;
fbi->fbdefio = &drm_fbdev_defio;
-
fb_deferred_io_init(fbi);
} else {
/* buffer is mapped for HW framebuffer */
ret = drm_client_buffer_vmap(fb_helper->buffer, &map);
if (ret)
return ret;
- if (map.is_iomem)
+ if (map.is_iomem) {
fbi->screen_base = map.vaddr_iomem;
- else
+ } else {
fbi->screen_buffer = map.vaddr;
+ fbi->flags |= FBINFO_VIRTFB;
+ }
/*
* Shamelessly leak the physical address to user-space. As
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From cd9f7f7ac5932129fe81b4c7559cfcb226ec7c5c Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann(a)suse.de>
Date: Tue, 1 Feb 2022 12:53:05 +0100
Subject: [PATCH] drm/fb-helper: Mark screen buffers in system memory with
FBINFO_VIRTFB
Mark screen buffers in system memory with FBINFO_VIRTFB. Otherwise, fbdev
deferred I/O marks mmap'ed areas of system memory with VM_IO. (There's an
inverse relationship between the two flags.)
For shadow buffers, also set the FBINFO_READS_FAST hint.
v3:
* change FB_ to FBINFO_ in commit description
v2:
* updated commit description (Daniel)
* added Fixes tag
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Fixes: d536540f304c ("drm/fb-helper: Add generic fbdev emulation .fb_probe function")
Reviewed-by: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v4.19+
Link: https://patchwork.freedesktop.org/patch/msgid/20220201115305.9333-1-tzimmer…
diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 9727a59d35fd..805c5a666490 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -2340,6 +2340,7 @@ static int drm_fb_helper_generic_probe(struct drm_fb_helper *fb_helper,
fbi->fbops = &drm_fbdev_fb_ops;
fbi->screen_size = sizes->surface_height * fb->pitches[0];
fbi->fix.smem_len = fbi->screen_size;
+ fbi->flags = FBINFO_DEFAULT;
drm_fb_helper_fill_info(fbi, fb_helper, sizes);
@@ -2347,19 +2348,21 @@ static int drm_fb_helper_generic_probe(struct drm_fb_helper *fb_helper,
fbi->screen_buffer = vzalloc(fbi->screen_size);
if (!fbi->screen_buffer)
return -ENOMEM;
+ fbi->flags |= FBINFO_VIRTFB | FBINFO_READS_FAST;
fbi->fbdefio = &drm_fbdev_defio;
-
fb_deferred_io_init(fbi);
} else {
/* buffer is mapped for HW framebuffer */
ret = drm_client_buffer_vmap(fb_helper->buffer, &map);
if (ret)
return ret;
- if (map.is_iomem)
+ if (map.is_iomem) {
fbi->screen_base = map.vaddr_iomem;
- else
+ } else {
fbi->screen_buffer = map.vaddr;
+ fbi->flags |= FBINFO_VIRTFB;
+ }
/*
* Shamelessly leak the physical address to user-space. As
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c0573ba5c5a2244dc02060b1f374d4593c1d20b7 Mon Sep 17 00:00:00 2001
From: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Date: Tue, 1 Mar 2022 15:33:40 +0100
Subject: [PATCH] KVM: s390x: fix SCK locking
When handling the SCK instruction, the kvm lock is taken, even though
the vcpu lock is already being held. The normal locking order is kvm
lock first and then vcpu lock. This can (and in some circumstances
does) lead to deadlocks.
The function kvm_s390_set_tod_clock is called both by the SCK handler
and by some IOCTLs to set the clock. The IOCTLs will not hold the vcpu
lock, so they can safely take the kvm lock. The SCK handler holds the
vcpu lock, but will also somehow need to acquire the kvm lock without
relinquishing the vcpu lock.
The solution is to factor out the code to set the clock, and provide
two wrappers. One is called like the original function and does the
locking, the other is called kvm_s390_try_set_tod_clock and uses
trylock to try to acquire the kvm lock. This new wrapper is then used
in the SCK handler. If locking fails, -EAGAIN is returned, which is
eventually propagated to userspace, thus also freeing the vcpu lock and
allowing for forward progress.
This is not the most efficient or elegant way to solve this issue, but
the SCK instruction is deprecated and its performance is not critical.
The goal of this patch is just to provide a simple but correct way to
fix the bug.
Fixes: 6a3f95a6b04c ("KVM: s390: Intercept SCK instruction")
Signed-off-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Reviewed-by: Janis Schoetterl-Glausch <scgl(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20220301143340.111129-1-imbrenda@linux.ibm.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b5ea95cf8686..b53ff693b66e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3961,14 +3961,12 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
return 0;
}
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod)
+static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
{
struct kvm_vcpu *vcpu;
union tod_clock clk;
unsigned long i;
- mutex_lock(&kvm->lock);
preempt_disable();
store_tod_clock_ext(&clk);
@@ -3989,7 +3987,22 @@ void kvm_s390_set_tod_clock(struct kvm *kvm,
kvm_s390_vcpu_unblock_all(kvm);
preempt_enable();
+}
+
+void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
+{
+ mutex_lock(&kvm->lock);
+ __kvm_s390_set_tod_clock(kvm, gtod);
+ mutex_unlock(&kvm->lock);
+}
+
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
+{
+ if (!mutex_trylock(&kvm->lock))
+ return 0;
+ __kvm_s390_set_tod_clock(kvm, gtod);
mutex_unlock(&kvm->lock);
+ return 1;
}
/**
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 4ba8fc30d87a..798955b62fa3 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -358,8 +358,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod);
+void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 30b24c42ef99..5beb7a4a11b3 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -102,7 +102,20 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_cond(vcpu, rc);
VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod);
- kvm_s390_set_tod_clock(vcpu->kvm, &gtod);
+ /*
+ * To set the TOD clock the kvm lock must be taken, but the vcpu lock
+ * is already held in handle_set_clock. The usual lock order is the
+ * opposite. As SCK is deprecated and should not be used in several
+ * cases, for example when the multiple epoch facility or TOD clock
+ * steering facility is installed (see Principles of Operation), a
+ * slow path can be used. If the lock can not be taken via try_lock,
+ * the instruction will be retried via -EAGAIN at a later point in
+ * time.
+ */
+ if (!kvm_s390_try_set_tod_clock(vcpu->kvm, &gtod)) {
+ kvm_s390_retry_instr(vcpu);
+ return -EAGAIN;
+ }
kvm_s390_set_psw_cc(vcpu, 0);
return 0;
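The heart of the fix is the trylock wrapper: a caller that already
holds the inner (vcpu) lock must not block on the outer (kvm) lock, so
it tries once and, on failure, backs off and retries the whole
instruction. A userspace sketch of the same pattern with POSIX mutexes
(the names are stand-ins for the kvm structures):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t kvm_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long tod_clock;

static void __set_tod_clock(unsigned long long tod)
{
	tod_clock = tod;	/* caller must hold kvm_lock */
}

/* Blocking wrapper: safe for callers that hold no other lock. */
static void set_tod_clock(unsigned long long tod)
{
	pthread_mutex_lock(&kvm_lock);
	__set_tod_clock(tod);
	pthread_mutex_unlock(&kvm_lock);
}

/* Trylock wrapper: for callers that already hold an inner lock and
 * would invert the lock order by blocking. Returns 1 on success,
 * 0 if the caller should back off and retry the instruction. */
static int try_set_tod_clock(unsigned long long tod)
{
	if (pthread_mutex_trylock(&kvm_lock) != 0)
		return 0;
	__set_tod_clock(tod);
	pthread_mutex_unlock(&kvm_lock);
	return 1;
}

int main(void)
{
	if (!try_set_tod_clock(42))
		puts("busy -- retry later (the SCK handler returns -EAGAIN)");
	else
		printf("tod set to %llu\n", tod_clock);
	return 0;
}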
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c0573ba5c5a2244dc02060b1f374d4593c1d20b7 Mon Sep 17 00:00:00 2001
From: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Date: Tue, 1 Mar 2022 15:33:40 +0100
Subject: [PATCH] KVM: s390x: fix SCK locking
When handling the SCK instruction, the kvm lock is taken, even though
the vcpu lock is already being held. The normal locking order is kvm
lock first and then vcpu lock. This can (and in some circumstances
does) lead to deadlocks.
The function kvm_s390_set_tod_clock is called both by the SCK handler
and by some IOCTLs to set the clock. The IOCTLs will not hold the vcpu
lock, so they can safely take the kvm lock. The SCK handler holds the
vcpu lock, but will also somehow need to acquire the kvm lock without
relinquishing the vcpu lock.
The solution is to factor out the code to set the clock, and provide
two wrappers. One is called like the original function and does the
locking, the other is called kvm_s390_try_set_tod_clock and uses
trylock to try to acquire the kvm lock. This new wrapper is then used
in the SCK handler. If locking fails, -EAGAIN is returned, which is
eventually propagated to userspace, thus also freeing the vcpu lock and
allowing for forward progress.
This is not the most efficient or elegant way to solve this issue, but
the SCK instruction is deprecated and its performance is not critical.
The goal of this patch is just to provide a simple but correct way to
fix the bug.
Fixes: 6a3f95a6b04c ("KVM: s390: Intercept SCK instruction")
Signed-off-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Reviewed-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
Reviewed-by: Janis Schoetterl-Glausch <scgl(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20220301143340.111129-1-imbrenda@linux.ibm.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Christian Borntraeger <borntraeger(a)linux.ibm.com>
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b5ea95cf8686..b53ff693b66e 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -3961,14 +3961,12 @@ static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
return 0;
}
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod)
+static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
{
struct kvm_vcpu *vcpu;
union tod_clock clk;
unsigned long i;
- mutex_lock(&kvm->lock);
preempt_disable();
store_tod_clock_ext(&clk);
@@ -3989,7 +3987,22 @@ void kvm_s390_set_tod_clock(struct kvm *kvm,
kvm_s390_vcpu_unblock_all(kvm);
preempt_enable();
+}
+
+void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
+{
+ mutex_lock(&kvm->lock);
+ __kvm_s390_set_tod_clock(kvm, gtod);
+ mutex_unlock(&kvm->lock);
+}
+
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod)
+{
+ if (!mutex_trylock(&kvm->lock))
+ return 0;
+ __kvm_s390_set_tod_clock(kvm, gtod);
mutex_unlock(&kvm->lock);
+ return 1;
}
/**
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 4ba8fc30d87a..798955b62fa3 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -358,8 +358,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
-void kvm_s390_set_tod_clock(struct kvm *kvm,
- const struct kvm_s390_vm_tod_clock *gtod);
+void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
+int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 30b24c42ef99..5beb7a4a11b3 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -102,7 +102,20 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_cond(vcpu, rc);
VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod);
- kvm_s390_set_tod_clock(vcpu->kvm, &gtod);
+ /*
+ * To set the TOD clock the kvm lock must be taken, but the vcpu lock
+ * is already held in handle_set_clock. The usual lock order is the
+ * opposite. As SCK is deprecated and should not be used in several
+ * cases, for example when the multiple epoch facility or TOD clock
+ * steering facility is installed (see Principles of Operation), a
+ * slow path can be used. If the lock can not be taken via try_lock,
+ * the instruction will be retried via -EAGAIN at a later point in
+ * time.
+ */
+ if (!kvm_s390_try_set_tod_clock(vcpu->kvm, &gtod)) {
+ kvm_s390_retry_instr(vcpu);
+ return -EAGAIN;
+ }
kvm_s390_set_psw_cc(vcpu, 0);
return 0;
The patch below does not apply to the 5.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79c9234ba596e903907de20573fd4bcc85315b06 Mon Sep 17 00:00:00 2001
From: Dongliang Mu <mudongliangabcd(a)gmail.com>
Date: Thu, 3 Mar 2022 22:40:27 +0800
Subject: [PATCH] btrfs: don't access possibly stale fs_info data in
device_list_add
Syzbot reported a possible use-after-free in printing information
in device_list_add.
Very similar to the bug fixed by commit 0697d9a61099 ("btrfs: don't
access possibly stale fs_info data for printing duplicate device"),
but this time the use occurs in btrfs_info_in_rcu.
Call Trace:
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
btrfs_printk+0x395/0x425 fs/btrfs/super.c:244
device_list_add.cold+0xd7/0x2ed fs/btrfs/volumes.c:957
btrfs_scan_one_device+0x4c7/0x5c0 fs/btrfs/volumes.c:1387
btrfs_control_ioctl+0x12a/0x2d0 fs/btrfs/super.c:2409
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Fix this by passing NULL instead of device->fs_info here too.
Reported-and-tested-by: syzbot+82650a4e0ed38f218363(a)syzkaller.appspotmail.com
CC: stable(a)vger.kernel.org # 4.19+
Signed-off-by: Dongliang Mu <mudongliangabcd(a)gmail.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e3e13d4940b..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -921,16 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -938,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
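The generalizable rule: when an object's back-pointer may already be
torn down, message helpers should accept NULL and print a context-free
prefix instead of dereferencing stale state. A userspace sketch of such
a NULL-tolerant logger (the btrfs helpers behave analogously; these
names are illustrative):

#include <stdio.h>

struct fs_info {
	char fsid[16];
};

/* NULL-tolerant: callers that cannot prove fs_info is still alive
 * pass NULL and get a generic prefix instead of a use-after-free. */
static void info_printk(const struct fs_info *fs_info, const char *msg)
{
	if (fs_info)
		printf("BTRFS info (device %s): %s\n", fs_info->fsid, msg);
	else
		printf("BTRFS info: %s\n", msg);
}

int main(void)
{
	info_printk(NULL, "devid 1 device path changed");
	return 0;
}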
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79c9234ba596e903907de20573fd4bcc85315b06 Mon Sep 17 00:00:00 2001
From: Dongliang Mu <mudongliangabcd(a)gmail.com>
Date: Thu, 3 Mar 2022 22:40:27 +0800
Subject: [PATCH] btrfs: don't access possibly stale fs_info data in
device_list_add
Syzbot reported a possible use-after-free in printing information
in device_list_add.
Very similar to the bug fixed by commit 0697d9a61099 ("btrfs: don't
access possibly stale fs_info data for printing duplicate device"),
but this time the use occurs in btrfs_info_in_rcu.
Call Trace:
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
btrfs_printk+0x395/0x425 fs/btrfs/super.c:244
device_list_add.cold+0xd7/0x2ed fs/btrfs/volumes.c:957
btrfs_scan_one_device+0x4c7/0x5c0 fs/btrfs/volumes.c:1387
btrfs_control_ioctl+0x12a/0x2d0 fs/btrfs/super.c:2409
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Fix this by passing NULL instead of device->fs_info here too.
Reported-and-tested-by: syzbot+82650a4e0ed38f218363(a)syzkaller.appspotmail.com
CC: stable(a)vger.kernel.org # 4.19+
Signed-off-by: Dongliang Mu <mudongliangabcd(a)gmail.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e3e13d4940b..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -921,16 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -938,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79c9234ba596e903907de20573fd4bcc85315b06 Mon Sep 17 00:00:00 2001
From: Dongliang Mu <mudongliangabcd(a)gmail.com>
Date: Thu, 3 Mar 2022 22:40:27 +0800
Subject: [PATCH] btrfs: don't access possibly stale fs_info data in
device_list_add
Syzbot reported a possible use-after-free in printing information
in device_list_add.
Very similar to the bug fixed by commit 0697d9a61099 ("btrfs: don't
access possibly stale fs_info data for printing duplicate device"),
but this time the use occurs in btrfs_info_in_rcu.
Call Trace:
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
btrfs_printk+0x395/0x425 fs/btrfs/super.c:244
device_list_add.cold+0xd7/0x2ed fs/btrfs/volumes.c:957
btrfs_scan_one_device+0x4c7/0x5c0 fs/btrfs/volumes.c:1387
btrfs_control_ioctl+0x12a/0x2d0 fs/btrfs/super.c:2409
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Fix this by passing NULL instead of device->fs_info here too.
Reported-and-tested-by: syzbot+82650a4e0ed38f218363(a)syzkaller.appspotmail.com
CC: stable(a)vger.kernel.org # 4.19+
Signed-off-by: Dongliang Mu <mudongliangabcd(a)gmail.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e3e13d4940b..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -921,16 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -938,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79c9234ba596e903907de20573fd4bcc85315b06 Mon Sep 17 00:00:00 2001
From: Dongliang Mu <mudongliangabcd(a)gmail.com>
Date: Thu, 3 Mar 2022 22:40:27 +0800
Subject: [PATCH] btrfs: don't access possibly stale fs_info data in
device_list_add
Syzbot reported a possible use-after-free in printing information
in device_list_add.
Very similar to the bug fixed by commit 0697d9a61099 ("btrfs: don't
access possibly stale fs_info data for printing duplicate device"),
but this time the use occurs in btrfs_info_in_rcu.
Call Trace:
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
btrfs_printk+0x395/0x425 fs/btrfs/super.c:244
device_list_add.cold+0xd7/0x2ed fs/btrfs/volumes.c:957
btrfs_scan_one_device+0x4c7/0x5c0 fs/btrfs/volumes.c:1387
btrfs_control_ioctl+0x12a/0x2d0 fs/btrfs/super.c:2409
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Fix this by passing NULL instead of device->fs_info here too.
Reported-and-tested-by: syzbot+82650a4e0ed38f218363(a)syzkaller.appspotmail.com
CC: stable(a)vger.kernel.org # 4.19+
Signed-off-by: Dongliang Mu <mudongliangabcd(a)gmail.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e3e13d4940b..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -921,16 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -938,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79c9234ba596e903907de20573fd4bcc85315b06 Mon Sep 17 00:00:00 2001
From: Dongliang Mu <mudongliangabcd(a)gmail.com>
Date: Thu, 3 Mar 2022 22:40:27 +0800
Subject: [PATCH] btrfs: don't access possibly stale fs_info data in
device_list_add
Syzbot reported a possible use-after-free in printing information
in device_list_add.
Very similar to the bug fixed by commit 0697d9a61099 ("btrfs: don't
access possibly stale fs_info data for printing duplicate device"),
but this time the use occurs in btrfs_info_in_rcu.
Call Trace:
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
btrfs_printk+0x395/0x425 fs/btrfs/super.c:244
device_list_add.cold+0xd7/0x2ed fs/btrfs/volumes.c:957
btrfs_scan_one_device+0x4c7/0x5c0 fs/btrfs/volumes.c:1387
btrfs_control_ioctl+0x12a/0x2d0 fs/btrfs/super.c:2409
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Fix this by passing NULL instead of device->fs_info here too.
Reported-and-tested-by: syzbot+82650a4e0ed38f218363(a)syzkaller.appspotmail.com
CC: stable(a)vger.kernel.org # 4.19+
Signed-off-by: Dongliang Mu <mudongliangabcd(a)gmail.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e3e13d4940b..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -921,16 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -938,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79c9234ba596e903907de20573fd4bcc85315b06 Mon Sep 17 00:00:00 2001
From: Dongliang Mu <mudongliangabcd(a)gmail.com>
Date: Thu, 3 Mar 2022 22:40:27 +0800
Subject: [PATCH] btrfs: don't access possibly stale fs_info data in
device_list_add
Syzbot reported a possible use-after-free in printing information
in device_list_add.
Very similar to the bug fixed by commit 0697d9a61099 ("btrfs: don't
access possibly stale fs_info data for printing duplicate device"),
but this time the use occurs in btrfs_info_in_rcu.
Call Trace:
kasan_report.cold+0x83/0xdf mm/kasan/report.c:459
btrfs_printk+0x395/0x425 fs/btrfs/super.c:244
device_list_add.cold+0xd7/0x2ed fs/btrfs/volumes.c:957
btrfs_scan_one_device+0x4c7/0x5c0 fs/btrfs/volumes.c:1387
btrfs_control_ioctl+0x12a/0x2d0 fs/btrfs/super.c:2409
vfs_ioctl fs/ioctl.c:51 [inline]
__do_sys_ioctl fs/ioctl.c:874 [inline]
__se_sys_ioctl fs/ioctl.c:860 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
Fix this by passing NULL instead of device->fs_info here too.
Reported-and-tested-by: syzbot+82650a4e0ed38f218363(a)syzkaller.appspotmail.com
CC: stable(a)vger.kernel.org # 4.19+
Signed-off-by: Dongliang Mu <mudongliangabcd(a)gmail.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e3e13d4940b..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -921,16 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -938,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d4568fc8525897e683983806f813be1ae9eedaed Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook(a)chromium.org>
Date: Mon, 24 Jan 2022 18:29:52 +0100
Subject: [PATCH] media: omap3isp: Use struct_group() for memcpy() region
In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields. Wrap the target region
in struct_group(). This additionally fixes a theoretical misalignment
of the copy (since the size of "buf" changes between 64-bit and 32-bit,
but this is likely never built for 64-bit).
FWIW, I think this code is totally broken on 64-bit (which appears to
not be a "real" build configuration): it would either always fail (with
an uninitialized data->buf_size) or would cause corruption in userspace
due to the copy_to_user() in the call path against an uninitialized
data->buf value:
omap3isp_stat_request_statistics_time32(...)
struct omap3isp_stat_data data64;
...
omap3isp_stat_request_statistics(stat, &data64);
int omap3isp_stat_request_statistics(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
buf = isp_stat_buf_get(stat, data);
static struct ispstat_buffer *isp_stat_buf_get(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
if (buf->buf_size > data->buf_size) {
...
return ERR_PTR(-EINVAL);
}
...
rval = copy_to_user(data->buf,
buf->virt_addr,
buf->buf_size);
Regardless, additionally initialize data64 to be zero-filled to avoid
undefined behavior.
Link: https://lore.kernel.org/lkml/20211215220505.GB21862@embeddedor
Cc: Arnd Bergmann <arnd(a)arndb.de>
Fixes: 378e3f81cb56 ("media: omap3isp: support 64-bit version of omap3isp_stat_data")
Cc: stable(a)vger.kernel.org
Reviewed-by: Gustavo A. R. Silva <gustavoars(a)kernel.org>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
diff --git a/drivers/media/platform/omap3isp/ispstat.c b/drivers/media/platform/omap3isp/ispstat.c
index 5b9b57f4d9bf..68cf68dbcace 100644
--- a/drivers/media/platform/omap3isp/ispstat.c
+++ b/drivers/media/platform/omap3isp/ispstat.c
@@ -512,7 +512,7 @@ int omap3isp_stat_request_statistics(struct ispstat *stat,
int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
struct omap3isp_stat_data_time32 *data)
{
- struct omap3isp_stat_data data64;
+ struct omap3isp_stat_data data64 = { };
int ret;
ret = omap3isp_stat_request_statistics(stat, &data64);
@@ -521,7 +521,8 @@ int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
data->ts.tv_sec = data64.ts.tv_sec;
data->ts.tv_usec = data64.ts.tv_usec;
- memcpy(&data->buf, &data64.buf, sizeof(*data) - sizeof(data->ts));
+ data->buf = (uintptr_t)data64.buf;
+ memcpy(&data->frame, &data64.frame, sizeof(data->frame));
return 0;
}
diff --git a/include/uapi/linux/omap3isp.h b/include/uapi/linux/omap3isp.h
index 87b55755f4ff..d9db7ad43890 100644
--- a/include/uapi/linux/omap3isp.h
+++ b/include/uapi/linux/omap3isp.h
@@ -162,6 +162,7 @@ struct omap3isp_h3a_aewb_config {
* struct omap3isp_stat_data - Statistic data sent to or received from user
* @ts: Timestamp of returned framestats.
* @buf: Pointer to pass to user.
+ * @buf_size: Size of buffer.
* @frame_number: Frame number of requested stats.
* @cur_frame: Current frame number being processed.
* @config_counter: Number of the configuration associated with the data.
@@ -176,10 +177,12 @@ struct omap3isp_stat_data {
struct timeval ts;
#endif
void __user *buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#ifdef __KERNEL__
@@ -189,10 +192,12 @@ struct omap3isp_stat_data_time32 {
__s32 tv_usec;
} ts;
__u32 buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#endif
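struct_group() wraps the members in an anonymous union so they stay
addressable both individually and as one named region, letting memcpy()
take an exact destination and size instead of the fragile sizeof
arithmetic. A standalone C11 sketch with the macro expanded by hand
(field layout mimics, but is not, the omap3isp UAPI):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hand-expanded equivalent of the kernel's struct_group(): the same
 * members are visible both directly and as a named sub-struct, so
 * memcpy() can target exactly this region. */
struct stat_data {
	uint64_t ts;		/* not part of the copied region */
	void *buf;		/* pointer width differs 32/64-bit */
	union {
		struct {
			uint32_t buf_size;
			uint16_t frame_number;
			uint16_t cur_frame;
			uint16_t config_counter;
		};
		struct {
			uint32_t buf_size;
			uint16_t frame_number;
			uint16_t cur_frame;
			uint16_t config_counter;
		} frame;
	};
};

int main(void)
{
	struct stat_data a = { .frame = { 64, 1, 2, 3 } }, b = { 0 };

	/* Bounded copy: sizeof(b.frame) covers only the grouped fields,
	 * unlike the old sizeof(*data) - sizeof(data->ts) arithmetic,
	 * which silently included the differently-sized buf pointer. */
	memcpy(&b.frame, &a.frame, sizeof(b.frame));
	printf("buf_size=%u frame=%u\n", b.frame.buf_size,
	       b.frame.frame_number);
	return 0;
}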
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d4568fc8525897e683983806f813be1ae9eedaed Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook(a)chromium.org>
Date: Mon, 24 Jan 2022 18:29:52 +0100
Subject: [PATCH] media: omap3isp: Use struct_group() for memcpy() region
In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields. Wrap the target region
in struct_group(). This additionally fixes a theoretical misalignment
of the copy (since the size of "buf" changes between 64-bit and 32-bit,
but this is likely never built for 64-bit).
FWIW, I think this code is totally broken on 64-bit (which does not
appear to be a "real" build configuration): it would either always fail (with
an uninitialized data->buf_size) or would cause corruption in userspace
due to the copy_to_user() in the call path against an uninitialized
data->buf value:
omap3isp_stat_request_statistics_time32(...)
struct omap3isp_stat_data data64;
...
omap3isp_stat_request_statistics(stat, &data64);
int omap3isp_stat_request_statistics(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
buf = isp_stat_buf_get(stat, data);
static struct ispstat_buffer *isp_stat_buf_get(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
if (buf->buf_size > data->buf_size) {
...
return ERR_PTR(-EINVAL);
}
...
rval = copy_to_user(data->buf,
buf->virt_addr,
buf->buf_size);
Regardless, additionally initialize data64 to be zero-filled to avoid
undefined behavior.
Link: https://lore.kernel.org/lkml/20211215220505.GB21862@embeddedor
Cc: Arnd Bergmann <arnd(a)arndb.de>
Fixes: 378e3f81cb56 ("media: omap3isp: support 64-bit version of omap3isp_stat_data")
Cc: stable(a)vger.kernel.org
Reviewed-by: Gustavo A. R. Silva <gustavoars(a)kernel.org>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
diff --git a/drivers/media/platform/omap3isp/ispstat.c b/drivers/media/platform/omap3isp/ispstat.c
index 5b9b57f4d9bf..68cf68dbcace 100644
--- a/drivers/media/platform/omap3isp/ispstat.c
+++ b/drivers/media/platform/omap3isp/ispstat.c
@@ -512,7 +512,7 @@ int omap3isp_stat_request_statistics(struct ispstat *stat,
int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
struct omap3isp_stat_data_time32 *data)
{
- struct omap3isp_stat_data data64;
+ struct omap3isp_stat_data data64 = { };
int ret;
ret = omap3isp_stat_request_statistics(stat, &data64);
@@ -521,7 +521,8 @@ int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
data->ts.tv_sec = data64.ts.tv_sec;
data->ts.tv_usec = data64.ts.tv_usec;
- memcpy(&data->buf, &data64.buf, sizeof(*data) - sizeof(data->ts));
+ data->buf = (uintptr_t)data64.buf;
+ memcpy(&data->frame, &data64.frame, sizeof(data->frame));
return 0;
}
diff --git a/include/uapi/linux/omap3isp.h b/include/uapi/linux/omap3isp.h
index 87b55755f4ff..d9db7ad43890 100644
--- a/include/uapi/linux/omap3isp.h
+++ b/include/uapi/linux/omap3isp.h
@@ -162,6 +162,7 @@ struct omap3isp_h3a_aewb_config {
* struct omap3isp_stat_data - Statistic data sent to or received from user
* @ts: Timestamp of returned framestats.
* @buf: Pointer to pass to user.
+ * @buf_size: Size of buffer.
* @frame_number: Frame number of requested stats.
* @cur_frame: Current frame number being processed.
* @config_counter: Number of the configuration associated with the data.
@@ -176,10 +177,12 @@ struct omap3isp_stat_data {
struct timeval ts;
#endif
void __user *buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#ifdef __KERNEL__
@@ -189,10 +192,12 @@ struct omap3isp_stat_data_time32 {
__s32 tv_usec;
} ts;
__u32 buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#endif
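Alongside the struct_group() change, the conversion helper above narrows
the user pointer explicitly instead of memcpy()ing raw bytes across it.
A hedged standalone sketch of that shape, with types simplified from the
omap3isp structs:

#include <stdint.h>

struct data64 {
	void *buf;		/* pointer-sized: 8 bytes on 64-bit */
	uint32_t buf_size;
	uint16_t frame_number;
};

struct data32 {
	uint32_t buf;		/* fixed 4 bytes in the compat layout */
	uint32_t buf_size;
	uint16_t frame_number;
};

static void to_compat(struct data32 *out, const struct data64 *in)
{
	/* Explicit, well-defined narrowing; a raw memcpy() would copy
	 * a size that silently depends on sizeof(void *). */
	out->buf = (uint32_t)(uintptr_t)in->buf;
	out->buf_size = in->buf_size;	/* copy fields, not byte ranges */
	out->frame_number = in->frame_number;
}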
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9b30430ea356f237945e52f8a3a42158877bd5a9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:02 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - only allow with rsa
The pkcs1pad template can be instantiated with an arbitrary akcipher
algorithm, which doesn't make sense; it is specifically an RSA padding
scheme. Make it check that the underlying algorithm really is RSA.
Fixes: 3d5b1ecdea6f ("crypto: rsa - RSA padding algorithm")
Cc: <stable(a)vger.kernel.org> # v4.5+
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 8ac3e73e8ea6..1b3545781425 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -621,6 +621,11 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
rsa_alg = crypto_spawn_akcipher_alg(&ctx->spawn);
+ if (strcmp(rsa_alg->base.cra_name, "rsa") != 0) {
+ err = -EINVAL;
+ goto err_free_inst;
+ }
+
err = -ENAMETOOLONG;
hash_name = crypto_attr_alg_name(tb[2]);
if (IS_ERR(hash_name)) {
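The guard itself is just a canonical-name comparison on the spawned
algorithm before the template instance is set up. A hedged, non-kernel
sketch of the same check (the stub type and function name below are
illustrative):

#include <errno.h>
#include <string.h>

struct akcipher_alg_stub {
	const char *cra_name;	/* canonical algorithm name, e.g. "rsa" */
};

/* pkcs1pad is an RSA-specific padding scheme, so refuse to wrap any
 * inner akcipher whose canonical name is not exactly "rsa". */
static int check_inner_alg(const struct akcipher_alg_stub *rsa_alg)
{
	if (strcmp(rsa_alg->cra_name, "rsa") != 0)
		return -EINVAL;
	return 0;
}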
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d4568fc8525897e683983806f813be1ae9eedaed Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook(a)chromium.org>
Date: Mon, 24 Jan 2022 18:29:52 +0100
Subject: [PATCH] media: omap3isp: Use struct_group() for memcpy() region
In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields. Wrap the target region
in struct_group(). This additionally fixes a theoretical misalignment
of the copy (since the size of "buf" changes between 64-bit and 32-bit,
but this is likely never built for 64-bit).
FWIW, I think this code is totally broken on 64-bit (which does not
appear to be a "real" build configuration): it would either always fail (with
an uninitialized data->buf_size) or would cause corruption in userspace
due to the copy_to_user() in the call path against an uninitialized
data->buf value:
omap3isp_stat_request_statistics_time32(...)
struct omap3isp_stat_data data64;
...
omap3isp_stat_request_statistics(stat, &data64);
int omap3isp_stat_request_statistics(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
buf = isp_stat_buf_get(stat, data);
static struct ispstat_buffer *isp_stat_buf_get(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
if (buf->buf_size > data->buf_size) {
...
return ERR_PTR(-EINVAL);
}
...
rval = copy_to_user(data->buf,
buf->virt_addr,
buf->buf_size);
Regardless, additionally initialize data64 to be zero-filled to avoid
undefined behavior.
Link: https://lore.kernel.org/lkml/20211215220505.GB21862@embeddedor
Cc: Arnd Bergmann <arnd(a)arndb.de>
Fixes: 378e3f81cb56 ("media: omap3isp: support 64-bit version of omap3isp_stat_data")
Cc: stable(a)vger.kernel.org
Reviewed-by: Gustavo A. R. Silva <gustavoars(a)kernel.org>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
diff --git a/drivers/media/platform/omap3isp/ispstat.c b/drivers/media/platform/omap3isp/ispstat.c
index 5b9b57f4d9bf..68cf68dbcace 100644
--- a/drivers/media/platform/omap3isp/ispstat.c
+++ b/drivers/media/platform/omap3isp/ispstat.c
@@ -512,7 +512,7 @@ int omap3isp_stat_request_statistics(struct ispstat *stat,
int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
struct omap3isp_stat_data_time32 *data)
{
- struct omap3isp_stat_data data64;
+ struct omap3isp_stat_data data64 = { };
int ret;
ret = omap3isp_stat_request_statistics(stat, &data64);
@@ -521,7 +521,8 @@ int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
data->ts.tv_sec = data64.ts.tv_sec;
data->ts.tv_usec = data64.ts.tv_usec;
- memcpy(&data->buf, &data64.buf, sizeof(*data) - sizeof(data->ts));
+ data->buf = (uintptr_t)data64.buf;
+ memcpy(&data->frame, &data64.frame, sizeof(data->frame));
return 0;
}
diff --git a/include/uapi/linux/omap3isp.h b/include/uapi/linux/omap3isp.h
index 87b55755f4ff..d9db7ad43890 100644
--- a/include/uapi/linux/omap3isp.h
+++ b/include/uapi/linux/omap3isp.h
@@ -162,6 +162,7 @@ struct omap3isp_h3a_aewb_config {
* struct omap3isp_stat_data - Statistic data sent to or received from user
* @ts: Timestamp of returned framestats.
* @buf: Pointer to pass to user.
+ * @buf_size: Size of buffer.
* @frame_number: Frame number of requested stats.
* @cur_frame: Current frame number being processed.
* @config_counter: Number of the configuration associated with the data.
@@ -176,10 +177,12 @@ struct omap3isp_stat_data {
struct timeval ts;
#endif
void __user *buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#ifdef __KERNEL__
@@ -189,10 +192,12 @@ struct omap3isp_stat_data_time32 {
__s32 tv_usec;
} ts;
__u32 buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#endif
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d4568fc8525897e683983806f813be1ae9eedaed Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook(a)chromium.org>
Date: Mon, 24 Jan 2022 18:29:52 +0100
Subject: [PATCH] media: omap3isp: Use struct_group() for memcpy() region
In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memcpy(), memmove(), and memset(), avoid
intentionally writing across neighboring fields. Wrap the target region
in struct_group(). This additionally fixes a theoretical misalignment
of the copy (since the size of "buf" changes between 64-bit and 32-bit,
but this is likely never built for 64-bit).
FWIW, I think this code is totally broken on 64-bit (which does not
appear to be a "real" build configuration): it would either always fail (with
an uninitialized data->buf_size) or would cause corruption in userspace
due to the copy_to_user() in the call path against an uninitialized
data->buf value:
omap3isp_stat_request_statistics_time32(...)
struct omap3isp_stat_data data64;
...
omap3isp_stat_request_statistics(stat, &data64);
int omap3isp_stat_request_statistics(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
buf = isp_stat_buf_get(stat, data);
static struct ispstat_buffer *isp_stat_buf_get(struct ispstat *stat,
struct omap3isp_stat_data *data)
...
if (buf->buf_size > data->buf_size) {
...
return ERR_PTR(-EINVAL);
}
...
rval = copy_to_user(data->buf,
buf->virt_addr,
buf->buf_size);
Regardless, additionally initialize data64 to be zero-filled to avoid
undefined behavior.
Link: https://lore.kernel.org/lkml/20211215220505.GB21862@embeddedor
Cc: Arnd Bergmann <arnd(a)arndb.de>
Fixes: 378e3f81cb56 ("media: omap3isp: support 64-bit version of omap3isp_stat_data")
Cc: stable(a)vger.kernel.org
Reviewed-by: Gustavo A. R. Silva <gustavoars(a)kernel.org>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
Reviewed-by: Laurent Pinchart <laurent.pinchart(a)ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab(a)kernel.org>
diff --git a/drivers/media/platform/omap3isp/ispstat.c b/drivers/media/platform/omap3isp/ispstat.c
index 5b9b57f4d9bf..68cf68dbcace 100644
--- a/drivers/media/platform/omap3isp/ispstat.c
+++ b/drivers/media/platform/omap3isp/ispstat.c
@@ -512,7 +512,7 @@ int omap3isp_stat_request_statistics(struct ispstat *stat,
int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
struct omap3isp_stat_data_time32 *data)
{
- struct omap3isp_stat_data data64;
+ struct omap3isp_stat_data data64 = { };
int ret;
ret = omap3isp_stat_request_statistics(stat, &data64);
@@ -521,7 +521,8 @@ int omap3isp_stat_request_statistics_time32(struct ispstat *stat,
data->ts.tv_sec = data64.ts.tv_sec;
data->ts.tv_usec = data64.ts.tv_usec;
- memcpy(&data->buf, &data64.buf, sizeof(*data) - sizeof(data->ts));
+ data->buf = (uintptr_t)data64.buf;
+ memcpy(&data->frame, &data64.frame, sizeof(data->frame));
return 0;
}
diff --git a/include/uapi/linux/omap3isp.h b/include/uapi/linux/omap3isp.h
index 87b55755f4ff..d9db7ad43890 100644
--- a/include/uapi/linux/omap3isp.h
+++ b/include/uapi/linux/omap3isp.h
@@ -162,6 +162,7 @@ struct omap3isp_h3a_aewb_config {
* struct omap3isp_stat_data - Statistic data sent to or received from user
* @ts: Timestamp of returned framestats.
* @buf: Pointer to pass to user.
+ * @buf_size: Size of buffer.
* @frame_number: Frame number of requested stats.
* @cur_frame: Current frame number being processed.
* @config_counter: Number of the configuration associated with the data.
@@ -176,10 +177,12 @@ struct omap3isp_stat_data {
struct timeval ts;
#endif
void __user *buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#ifdef __KERNEL__
@@ -189,10 +192,12 @@ struct omap3isp_stat_data_time32 {
__s32 tv_usec;
} ts;
__u32 buf;
- __u32 buf_size;
- __u16 frame_number;
- __u16 cur_frame;
- __u16 config_counter;
+ __struct_group(/* no tag */, frame, /* no attrs */,
+ __u32 buf_size;
+ __u16 frame_number;
+ __u16 cur_frame;
+ __u16 config_counter;
+ );
};
#endif
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9b30430ea356f237945e52f8a3a42158877bd5a9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:02 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - only allow with rsa
The pkcs1pad template can be instantiated with an arbitrary akcipher
algorithm, which doesn't make sense; it is specifically an RSA padding
scheme. Make it check that the underlying algorithm really is RSA.
Fixes: 3d5b1ecdea6f ("crypto: rsa - RSA padding algorithm")
Cc: <stable(a)vger.kernel.org> # v4.5+
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 8ac3e73e8ea6..1b3545781425 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -621,6 +621,11 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
rsa_alg = crypto_spawn_akcipher_alg(&ctx->spawn);
+ if (strcmp(rsa_alg->base.cra_name, "rsa") != 0) {
+ err = -EINVAL;
+ goto err_free_inst;
+ }
+
err = -ENAMETOOLONG;
hash_name = crypto_attr_alg_name(tb[2]);
if (IS_ERR(hash_name)) {
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9b30430ea356f237945e52f8a3a42158877bd5a9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:02 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - only allow with rsa
The pkcs1pad template can be instantiated with an arbitrary akcipher
algorithm, which doesn't make sense; it is specifically an RSA padding
scheme. Make it check that the underlying algorithm really is RSA.
Fixes: 3d5b1ecdea6f ("crypto: rsa - RSA padding algorithm")
Cc: <stable(a)vger.kernel.org> # v4.5+
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 8ac3e73e8ea6..1b3545781425 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -621,6 +621,11 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
rsa_alg = crypto_spawn_akcipher_alg(&ctx->spawn);
+ if (strcmp(rsa_alg->base.cra_name, "rsa") != 0) {
+ err = -EINVAL;
+ goto err_free_inst;
+ }
+
err = -ENAMETOOLONG;
hash_name = crypto_attr_alg_name(tb[2]);
if (IS_ERR(hash_name)) {
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9b30430ea356f237945e52f8a3a42158877bd5a9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:02 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - only allow with rsa
The pkcs1pad template can be instantiated with an arbitrary akcipher
algorithm, which doesn't make sense; it is specifically an RSA padding
scheme. Make it check that the underlying algorithm really is RSA.
Fixes: 3d5b1ecdea6f ("crypto: rsa - RSA padding algorithm")
Cc: <stable(a)vger.kernel.org> # v4.5+
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 8ac3e73e8ea6..1b3545781425 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -621,6 +621,11 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
rsa_alg = crypto_spawn_akcipher_alg(&ctx->spawn);
+ if (strcmp(rsa_alg->base.cra_name, "rsa") != 0) {
+ err = -EINVAL;
+ goto err_free_inst;
+ }
+
err = -ENAMETOOLONG;
hash_name = crypto_attr_alg_name(tb[2]);
if (IS_ERR(hash_name)) {
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a24611ea356c7f3f0ec926da11b9482ac1f414fd Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:05 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - fix buffer overread in
pkcs1pad_verify_complete()
Before checking whether the expected digest_info is present, we need to
check that there are enough bytes remaining.
Fixes: a49de377e051 ("crypto: Add hash param to pkcs1pad")
Cc: <stable(a)vger.kernel.org> # v4.6+
Cc: Tadeusz Struk <tadeusz.struk(a)linaro.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 6b556ddeb3a0..9d804831c8b3 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -476,6 +476,8 @@ static int pkcs1pad_verify_complete(struct akcipher_request *req, int err)
pos++;
if (digest_info) {
+ if (digest_info->size > dst_len - pos)
+ goto done;
if (crypto_memneq(out_buf + pos, digest_info->data,
digest_info->size))
goto done;
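The bug class here is comparing before checking how many bytes remain.
A hedged userspace sketch of the corrected order (the kernel uses
crypto_memneq() for a constant-time comparison; plain memcmp() below is
only for illustration):

#include <stddef.h>
#include <string.h>

/* Compare the expected digest_info prefix only when enough bytes are
 * left after `pos`, mirroring the digest_info->size > dst_len - pos
 * guard added by the patch. Caller must ensure pos <= len. */
static int prefix_present(const unsigned char *out_buf, size_t len,
			  size_t pos, const unsigned char *want,
			  size_t want_len)
{
	if (want_len > len - pos)
		return 0;	/* too few bytes remain: cannot match */
	return memcmp(out_buf + pos, want, want_len) == 0;
}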
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a24611ea356c7f3f0ec926da11b9482ac1f414fd Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:05 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - fix buffer overread in
pkcs1pad_verify_complete()
Before checking whether the expected digest_info is present, we need to
check that there are enough bytes remaining.
Fixes: a49de377e051 ("crypto: Add hash param to pkcs1pad")
Cc: <stable(a)vger.kernel.org> # v4.6+
Cc: Tadeusz Struk <tadeusz.struk(a)linaro.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 6b556ddeb3a0..9d804831c8b3 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -476,6 +476,8 @@ static int pkcs1pad_verify_complete(struct akcipher_request *req, int err)
pos++;
if (digest_info) {
+ if (digest_info->size > dst_len - pos)
+ goto done;
if (crypto_memneq(out_buf + pos, digest_info->data,
digest_info->size))
goto done;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a24611ea356c7f3f0ec926da11b9482ac1f414fd Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:05 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - fix buffer overread in
pkcs1pad_verify_complete()
Before checking whether the expected digest_info is present, we need to
check that there are enough bytes remaining.
Fixes: a49de377e051 ("crypto: Add hash param to pkcs1pad")
Cc: <stable(a)vger.kernel.org> # v4.6+
Cc: Tadeusz Struk <tadeusz.struk(a)linaro.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 6b556ddeb3a0..9d804831c8b3 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -476,6 +476,8 @@ static int pkcs1pad_verify_complete(struct akcipher_request *req, int err)
pos++;
if (digest_info) {
+ if (digest_info->size > dst_len - pos)
+ goto done;
if (crypto_memneq(out_buf + pos, digest_info->data,
digest_info->size))
goto done;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d3481accd974541e6a5d6a1fb588924a3519c36e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:04 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - restore signature length check
RSA PKCS#1 v1.5 signatures are required to be the same length as the RSA
key size. RFC8017 specifically requires the verifier to check this
(https://datatracker.ietf.org/doc/html/rfc8017#section-8.2.2).
Commit a49de377e051 ("crypto: Add hash param to pkcs1pad") changed the
kernel to allow longer signatures, but didn't explain this part of the
change; it seems to be unrelated to the rest of the commit.
Revert this change, since it doesn't appear to be correct.
We can be pretty sure that no one is relying on overly-long signatures
(which would have to be front-padded with zeroes) being supported, given
that they would have been broken since commit c7381b012872
("crypto: akcipher - new verify API for public key algorithms").
Fixes: a49de377e051 ("crypto: Add hash param to pkcs1pad")
Cc: <stable(a)vger.kernel.org> # v4.6+
Cc: Tadeusz Struk <tadeusz.struk(a)linaro.org>
Suggested-by: Vitaly Chikunov <vt(a)altlinux.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 7b223adebabf..6b556ddeb3a0 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -538,7 +538,7 @@ static int pkcs1pad_verify(struct akcipher_request *req)
if (WARN_ON(req->dst) ||
WARN_ON(!req->dst_len) ||
- !ctx->key_size || req->src_len < ctx->key_size)
+ !ctx->key_size || req->src_len != ctx->key_size)
return -EINVAL;
req_ctx->out_buf = kmalloc(ctx->key_size + req->dst_len, GFP_KERNEL);
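The restored rule from RFC 8017 section 8.2.2 is an exact-length
comparison, not a lower bound. A minimal sketch of the check in
isolation (names are illustrative):

#include <errno.h>
#include <stddef.h>

/* An RSASSA-PKCS1-v1_5 signature must be exactly as long as the RSA
 * modulus (ctx->key_size in the driver); shorter and longer inputs
 * are both rejected up front. */
static int check_sig_len(size_t src_len, size_t key_size)
{
	if (!key_size || src_len != key_size)
		return -EINVAL;
	return 0;
}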
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d3481accd974541e6a5d6a1fb588924a3519c36e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:04 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - restore signature length check
RSA PKCS#1 v1.5 signatures are required to be the same length as the RSA
key size. RFC8017 specifically requires the verifier to check this
(https://datatracker.ietf.org/doc/html/rfc8017#section-8.2.2).
Commit a49de377e051 ("crypto: Add hash param to pkcs1pad") changed the
kernel to allow longer signatures, but didn't explain this part of the
change; it seems to be unrelated to the rest of the commit.
Revert this change, since it doesn't appear to be correct.
We can be pretty sure that no one is relying on overly-long signatures
(which would have to be front-padded with zeroes) being supported, given
that they would have been broken since commit c7381b012872
("crypto: akcipher - new verify API for public key algorithms").
Fixes: a49de377e051 ("crypto: Add hash param to pkcs1pad")
Cc: <stable(a)vger.kernel.org> # v4.6+
Cc: Tadeusz Struk <tadeusz.struk(a)linaro.org>
Suggested-by: Vitaly Chikunov <vt(a)altlinux.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 7b223adebabf..6b556ddeb3a0 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -538,7 +538,7 @@ static int pkcs1pad_verify(struct akcipher_request *req)
if (WARN_ON(req->dst) ||
WARN_ON(!req->dst_len) ||
- !ctx->key_size || req->src_len < ctx->key_size)
+ !ctx->key_size || req->src_len != ctx->key_size)
return -EINVAL;
req_ctx->out_buf = kmalloc(ctx->key_size + req->dst_len, GFP_KERNEL);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d3481accd974541e6a5d6a1fb588924a3519c36e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers(a)google.com>
Date: Tue, 18 Jan 2022 16:13:04 -0800
Subject: [PATCH] crypto: rsa-pkcs1pad - restore signature length check
RSA PKCS#1 v1.5 signatures are required to be the same length as the RSA
key size. RFC8017 specifically requires the verifier to check this
(https://datatracker.ietf.org/doc/html/rfc8017#section-8.2.2).
Commit a49de377e051 ("crypto: Add hash param to pkcs1pad") changed the
kernel to allow longer signatures, but didn't explain this part of the
change; it seems to be unrelated to the rest of the commit.
Revert this change, since it doesn't appear to be correct.
We can be pretty sure that no one is relying on overly-long signatures
(which would have to be front-padded with zeroes) being supported, given
that they would have been broken since commit c7381b012872
("crypto: akcipher - new verify API for public key algorithms").
Fixes: a49de377e051 ("crypto: Add hash param to pkcs1pad")
Cc: <stable(a)vger.kernel.org> # v4.6+
Cc: Tadeusz Struk <tadeusz.struk(a)linaro.org>
Suggested-by: Vitaly Chikunov <vt(a)altlinux.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
Signed-off-by: Herbert Xu <herbert(a)gondor.apana.org.au>
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index 7b223adebabf..6b556ddeb3a0 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -538,7 +538,7 @@ static int pkcs1pad_verify(struct akcipher_request *req)
if (WARN_ON(req->dst) ||
WARN_ON(!req->dst_len) ||
- !ctx->key_size || req->src_len < ctx->key_size)
+ !ctx->key_size || req->src_len != ctx->key_size)
return -EINVAL;
req_ctx->out_buf = kmalloc(ctx->key_size + req->dst_len, GFP_KERNEL);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8126b1c73108bc691f5643df19071a59a69d0bc6 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Mon, 14 Mar 2022 19:59:53 +0100
Subject: [PATCH] pstore: Don't use semaphores in always-atomic-context code
pstore_dump() is *always* invoked in atomic context (nowadays in an RCU
read-side critical section, before that under a spinlock).
It doesn't make sense to try to use semaphores here.
This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock
to semaphore"), except that two parts aren't restored back exactly as they
were:
- keep the lock initialization in pstore_register
- in efi_pstore_write(), always set the "block" flag to false
- omit "is_locked", that was unnecessary since
commit 959217c84c27 ("pstore: Actually give up during locking failure")
- fix the bailout message
The actual problem that the buggy commit was trying to address may have
been that the use of preemptible() in efi_pstore_write() was wrong - it
only looks at preempt_count() and the state of IRQs, but __rcu_read_lock()
doesn't touch either of those under CONFIG_PREEMPT_RCU.
(Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in
RCU read-side critical sections, but you're not allowed to actively
block/reschedule.)
Lockdep probably never caught the problem because it's very rare that you
actually hit the contended case, so lockdep always just sees the
down_trylock(), not the down_interruptible(), and so it can't tell that
there's a problem.
Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore")
Cc: stable(a)vger.kernel.org
Acked-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Signed-off-by: Jann Horn <jannh(a)google.com>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 0ef086e43090..7e771c56c13c 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -266,7 +266,7 @@ static int efi_pstore_write(struct pstore_record *record)
efi_name[i] = name[i];
ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
- preemptible(), record->size, record->psi->buf);
+ false, record->size, record->psi->buf);
if (record->reason == KMSG_DUMP_OOPS && try_module_get(THIS_MODULE))
if (!schedule_work(&efivar_work))
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f243cb5e6a4f..e26162f102ff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@ static void pstore_timer_kick(void)
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
- return;
- }
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index eb93a54cff31..e97a8188f0fd 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -14,7 +14,7 @@
#include <linux/errno.h>
#include <linux/kmsg_dump.h>
#include <linux/mutex.h>
-#include <linux/semaphore.h>
+#include <linux/spinlock.h>
#include <linux/time.h>
#include <linux/types.h>
@@ -87,7 +87,7 @@ struct pstore_record {
* @owner: module which is responsible for this backend driver
* @name: name of the backend driver
*
- * @buf_lock: semaphore to serialize access to @buf
+ * @buf_lock: spinlock to serialize access to @buf
* @buf: preallocated crash dump buffer
* @bufsize: size of @buf available for crash dump bytes (must match
* smallest number of bytes available for writing to a
@@ -178,7 +178,7 @@ struct pstore_info {
struct module *owner;
const char *name;
- struct semaphore buf_lock;
+ spinlock_t buf_lock;
char *buf;
size_t bufsize;
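The rule the revert restores: a context that may not sleep can only
ever trylock and must bail out on contention, while a sleepable context
may wait. A hedged userspace analogue with POSIX spinlocks (the kernel
code uses spin_trylock_irqsave()/spin_lock_irqsave()); the function
names below are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_spinlock_t buf_lock;

static bool dump_record(bool cannot_block)
{
	if (cannot_block) {
		/* NMI/panic/emergency analogue: never wait, just bail. */
		if (pthread_spin_trylock(&buf_lock) != 0) {
			fprintf(stderr, "dump skipped: concurrent dump\n");
			return false;
		}
	} else {
		pthread_spin_lock(&buf_lock);	/* spinning is allowed here */
	}
	/* ... emit the crash record into the shared buffer ... */
	pthread_spin_unlock(&buf_lock);
	return true;
}

int main(void)
{
	pthread_spin_init(&buf_lock, PTHREAD_PROCESS_PRIVATE);
	return dump_record(true) ? 0 : 1;
}

Unlike the semaphore version, nothing on this path can schedule, which
is what an always-atomic caller such as pstore_dump() requires.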
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8126b1c73108bc691f5643df19071a59a69d0bc6 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Mon, 14 Mar 2022 19:59:53 +0100
Subject: [PATCH] pstore: Don't use semaphores in always-atomic-context code
pstore_dump() is *always* invoked in atomic context (nowadays in an RCU
read-side critical section, before that under a spinlock).
It doesn't make sense to try to use semaphores here.
This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock
to semaphore"), except that two parts aren't restored back exactly as they
were:
- keep the lock initialization in pstore_register
- in efi_pstore_write(), always set the "block" flag to false
- omit "is_locked", that was unnecessary since
commit 959217c84c27 ("pstore: Actually give up during locking failure")
- fix the bailout message
The actual problem that the buggy commit was trying to address may have
been that the use of preemptible() in efi_pstore_write() was wrong - it
only looks at preempt_count() and the state of IRQs, but __rcu_read_lock()
doesn't touch either of those under CONFIG_PREEMPT_RCU.
(Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in
RCU read-side critical sections, but you're not allowed to actively
block/reschedule.)
Lockdep probably never caught the problem because it's very rare that you
actually hit the contended case, so lockdep always just sees the
down_trylock(), not the down_interruptible(), and so it can't tell that
there's a problem.
Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore")
Cc: stable(a)vger.kernel.org
Acked-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Signed-off-by: Jann Horn <jannh(a)google.com>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 0ef086e43090..7e771c56c13c 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -266,7 +266,7 @@ static int efi_pstore_write(struct pstore_record *record)
efi_name[i] = name[i];
ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
- preemptible(), record->size, record->psi->buf);
+ false, record->size, record->psi->buf);
if (record->reason == KMSG_DUMP_OOPS && try_module_get(THIS_MODULE))
if (!schedule_work(&efivar_work))
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f243cb5e6a4f..e26162f102ff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@ static void pstore_timer_kick(void)
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
- return;
- }
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index eb93a54cff31..e97a8188f0fd 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -14,7 +14,7 @@
#include <linux/errno.h>
#include <linux/kmsg_dump.h>
#include <linux/mutex.h>
-#include <linux/semaphore.h>
+#include <linux/spinlock.h>
#include <linux/time.h>
#include <linux/types.h>
@@ -87,7 +87,7 @@ struct pstore_record {
* @owner: module which is responsible for this backend driver
* @name: name of the backend driver
*
- * @buf_lock: semaphore to serialize access to @buf
+ * @buf_lock: spinlock to serialize access to @buf
* @buf: preallocated crash dump buffer
* @bufsize: size of @buf available for crash dump bytes (must match
* smallest number of bytes available for writing to a
@@ -178,7 +178,7 @@ struct pstore_info {
struct module *owner;
const char *name;
- struct semaphore buf_lock;
+ spinlock_t buf_lock;
char *buf;
size_t bufsize;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8126b1c73108bc691f5643df19071a59a69d0bc6 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh(a)google.com>
Date: Mon, 14 Mar 2022 19:59:53 +0100
Subject: [PATCH] pstore: Don't use semaphores in always-atomic-context code
pstore_dump() is *always* invoked in atomic context (nowadays in an RCU
read-side critical section, before that under a spinlock).
It doesn't make sense to try to use semaphores here.
This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock
to semaphore"), except that two parts aren't restored back exactly as they
were:
- keep the lock initialization in pstore_register
- in efi_pstore_write(), always set the "block" flag to false
- omit "is_locked", that was unnecessary since
commit 959217c84c27 ("pstore: Actually give up during locking failure")
- fix the bailout message
The actual problem that the buggy commit was trying to address may have
been that the use of preemptible() in efi_pstore_write() was wrong - it
only looks at preempt_count() and the state of IRQs, but __rcu_read_lock()
doesn't touch either of those under CONFIG_PREEMPT_RCU.
(Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in
RCU read-side critical sections, but you're not allowed to actively
block/reschedule.)
Lockdep probably never caught the problem because it's very rare that you
actually hit the contended case, so lockdep always just sees the
down_trylock(), not the down_interruptible(), and so it can't tell that
there's a problem.
Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore")
Cc: stable(a)vger.kernel.org
Acked-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Signed-off-by: Jann Horn <jannh(a)google.com>
Signed-off-by: Kees Cook <keescook(a)chromium.org>
Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 0ef086e43090..7e771c56c13c 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -266,7 +266,7 @@ static int efi_pstore_write(struct pstore_record *record)
efi_name[i] = name[i];
ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
- preemptible(), record->size, record->psi->buf);
+ false, record->size, record->psi->buf);
if (record->reason == KMSG_DUMP_OOPS && try_module_get(THIS_MODULE))
if (!schedule_work(&efivar_work))
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f243cb5e6a4f..e26162f102ff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@ static void pstore_timer_kick(void)
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
- return;
- }
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index eb93a54cff31..e97a8188f0fd 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -14,7 +14,7 @@
#include <linux/errno.h>
#include <linux/kmsg_dump.h>
#include <linux/mutex.h>
-#include <linux/semaphore.h>
+#include <linux/spinlock.h>
#include <linux/time.h>
#include <linux/types.h>
@@ -87,7 +87,7 @@ struct pstore_record {
* @owner: module which is responsible for this backend driver
* @name: name of the backend driver
*
- * @buf_lock: semaphore to serialize access to @buf
+ * @buf_lock: spinlock to serialize access to @buf
* @buf: preallocated crash dump buffer
* @bufsize: size of @buf available for crash dump bytes (must match
* smallest number of bytes available for writing to a
@@ -178,7 +178,7 @@ struct pstore_info {
struct module *owner;
const char *name;
- struct semaphore buf_lock;
+ spinlock_t buf_lock;
char *buf;
size_t bufsize;
The patch below does not apply to the 5.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7344bad7fb6daa4877a1c064b52c7d5f9182c41b Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Wed, 23 Mar 2022 12:04:38 +0200
Subject: [PATCH] drm/edid: fix CEA extension byte #3 parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Only an EDID CEA extension has byte #3, while the CTA DisplayID Data
Block does not. Don't interpret bogus data for color formats.
For most displays it's probably an unlikely scenario you'd have a CTA
DisplayID Data Block without a CEA extension, but they do exist.
Fixes: e28ad544f462 ("drm/edid: parse CEA blocks embedded in DisplayID")
Cc: <stable(a)vger.kernel.org>
Cc: Shawn C Lee <shawn.c.lee(a)intel.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220323100438.1757295-1-jani…
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index f07af6786cec..cc7bd58369df 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -5188,10 +5188,14 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
/* The existence of a CEA block should imply RGB support */
info->color_formats = DRM_COLOR_FORMAT_RGB444;
- if (edid_ext[3] & EDID_CEA_YCRCB444)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
- if (edid_ext[3] & EDID_CEA_YCRCB422)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+
+ /* CTA DisplayID Data Block does not have byte #3 */
+ if (edid_ext[0] == CEA_EXT) {
+ if (edid_ext[3] & EDID_CEA_YCRCB444)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
+ if (edid_ext[3] & EDID_CEA_YCRCB422)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+ }
if (cea_db_offsets(edid_ext, &start, &end))
return;
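The fix hinges on checking the extension tag byte before touching byte
#3 at all. A hedged standalone sketch of the parse (constants follow
CTA-861; the names below are illustrative, not the drm_edid.c defines):

#include <stdbool.h>
#include <stdint.h>

#define CEA_EXT_TAG	0x02		/* CTA extension tag byte */
#define CEA_YCBCR444	(1 << 5)	/* byte #3, bit 5 */
#define CEA_YCBCR422	(1 << 4)	/* byte #3, bit 4 */

struct color_formats { bool ycbcr444, ycbcr422; };

/* Read the color-format flags only from a true CEA extension block;
 * a CTA DisplayID Data Block reaches the same parser but has no
 * byte #3 to read. */
static void parse_color_formats(const uint8_t edid_ext[128],
				struct color_formats *f)
{
	if (edid_ext[0] != CEA_EXT_TAG)
		return;
	f->ycbcr444 = edid_ext[3] & CEA_YCBCR444;
	f->ycbcr422 = edid_ext[3] & CEA_YCBCR422;
}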
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7344bad7fb6daa4877a1c064b52c7d5f9182c41b Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Wed, 23 Mar 2022 12:04:38 +0200
Subject: [PATCH] drm/edid: fix CEA extension byte #3 parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Only an EDID CEA extension has byte #3, while the CTA DisplayID Data
Block does not. Don't interpret bogus data for color formats.
For most displays it's probably an unlikely scenario you'd have a CTA
DisplayID Data Block without a CEA extension, but they do exist.
Fixes: e28ad544f462 ("drm/edid: parse CEA blocks embedded in DisplayID")
Cc: <stable(a)vger.kernel.org>
Cc: Shawn C Lee <shawn.c.lee(a)intel.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220323100438.1757295-1-jani…
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index f07af6786cec..cc7bd58369df 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -5188,10 +5188,14 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
/* The existence of a CEA block should imply RGB support */
info->color_formats = DRM_COLOR_FORMAT_RGB444;
- if (edid_ext[3] & EDID_CEA_YCRCB444)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
- if (edid_ext[3] & EDID_CEA_YCRCB422)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+
+ /* CTA DisplayID Data Block does not have byte #3 */
+ if (edid_ext[0] == CEA_EXT) {
+ if (edid_ext[3] & EDID_CEA_YCRCB444)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
+ if (edid_ext[3] & EDID_CEA_YCRCB422)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+ }
if (cea_db_offsets(edid_ext, &start, &end))
return;
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7344bad7fb6daa4877a1c064b52c7d5f9182c41b Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Wed, 23 Mar 2022 12:04:38 +0200
Subject: [PATCH] drm/edid: fix CEA extension byte #3 parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Only an EDID CEA extension has byte #3, while the CTA DisplayID Data
Block does not. Don't interpret bogus data for color formats.
For most displays it's probably an unlikely scenario you'd have a CTA
DisplayID Data Block without a CEA extension, but they do exist.
Fixes: e28ad544f462 ("drm/edid: parse CEA blocks embedded in DisplayID")
Cc: <stable(a)vger.kernel.org>
Cc: Shawn C Lee <shawn.c.lee(a)intel.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220323100438.1757295-1-jani…
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index f07af6786cec..cc7bd58369df 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -5188,10 +5188,14 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
/* The existence of a CEA block should imply RGB support */
info->color_formats = DRM_COLOR_FORMAT_RGB444;
- if (edid_ext[3] & EDID_CEA_YCRCB444)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
- if (edid_ext[3] & EDID_CEA_YCRCB422)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+
+ /* CTA DisplayID Data Block does not have byte #3 */
+ if (edid_ext[0] == CEA_EXT) {
+ if (edid_ext[3] & EDID_CEA_YCRCB444)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
+ if (edid_ext[3] & EDID_CEA_YCRCB422)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+ }
if (cea_db_offsets(edid_ext, &start, &end))
return;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7344bad7fb6daa4877a1c064b52c7d5f9182c41b Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Wed, 23 Mar 2022 12:04:38 +0200
Subject: [PATCH] drm/edid: fix CEA extension byte #3 parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Only an EDID CEA extension has byte #3, while the CTA DisplayID Data
Block does not. Don't interpret bogus data for color formats.
For most displays it's probably an unlikely scenario you'd have a CTA
DisplayID Data Block without a CEA extension, but they do exist.
Fixes: e28ad544f462 ("drm/edid: parse CEA blocks embedded in DisplayID")
Cc: <stable(a)vger.kernel.org>
Cc: Shawn C Lee <shawn.c.lee(a)intel.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220323100438.1757295-1-jani…
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index f07af6786cec..cc7bd58369df 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -5188,10 +5188,14 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
/* The existence of a CEA block should imply RGB support */
info->color_formats = DRM_COLOR_FORMAT_RGB444;
- if (edid_ext[3] & EDID_CEA_YCRCB444)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
- if (edid_ext[3] & EDID_CEA_YCRCB422)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+
+ /* CTA DisplayID Data Block does not have byte #3 */
+ if (edid_ext[0] == CEA_EXT) {
+ if (edid_ext[3] & EDID_CEA_YCRCB444)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
+ if (edid_ext[3] & EDID_CEA_YCRCB422)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+ }
if (cea_db_offsets(edid_ext, &start, &end))
return;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7344bad7fb6daa4877a1c064b52c7d5f9182c41b Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Wed, 23 Mar 2022 12:04:38 +0200
Subject: [PATCH] drm/edid: fix CEA extension byte #3 parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Only an EDID CEA extension has byte #3, while the CTA DisplayID Data
Block does not. Don't interpret bogus data for color formats.
For most displays it's probably an unlikely scenario you'd have a CTA
DisplayID Data Block without a CEA extension, but they do exist.
Fixes: e28ad544f462 ("drm/edid: parse CEA blocks embedded in DisplayID")
Cc: <stable(a)vger.kernel.org>
Cc: Shawn C Lee <shawn.c.lee(a)intel.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220323100438.1757295-1-jani…
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index f07af6786cec..cc7bd58369df 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -5188,10 +5188,14 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
/* The existence of a CEA block should imply RGB support */
info->color_formats = DRM_COLOR_FORMAT_RGB444;
- if (edid_ext[3] & EDID_CEA_YCRCB444)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
- if (edid_ext[3] & EDID_CEA_YCRCB422)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+
+ /* CTA DisplayID Data Block does not have byte #3 */
+ if (edid_ext[0] == CEA_EXT) {
+ if (edid_ext[3] & EDID_CEA_YCRCB444)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
+ if (edid_ext[3] & EDID_CEA_YCRCB422)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+ }
if (cea_db_offsets(edid_ext, &start, &end))
return;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7344bad7fb6daa4877a1c064b52c7d5f9182c41b Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula(a)intel.com>
Date: Wed, 23 Mar 2022 12:04:38 +0200
Subject: [PATCH] drm/edid: fix CEA extension byte #3 parsing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Only an EDID CEA extension has byte #3, while the CTA DisplayID Data
Block does not. Don't interpret bogus data for color formats.
For most displays it's probably an unlikely scenario you'd have a CTA
DisplayID Data Block without a CEA extension, but they do exist.
Fixes: e28ad544f462 ("drm/edid: parse CEA blocks embedded in DisplayID")
Cc: <stable(a)vger.kernel.org>
Cc: Shawn C Lee <shawn.c.lee(a)intel.com>
Cc: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Signed-off-by: Jani Nikula <jani.nikula(a)intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala(a)linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220323100438.1757295-1-jani…
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index f07af6786cec..cc7bd58369df 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -5188,10 +5188,14 @@ static void drm_parse_cea_ext(struct drm_connector *connector,
/* The existence of a CEA block should imply RGB support */
info->color_formats = DRM_COLOR_FORMAT_RGB444;
- if (edid_ext[3] & EDID_CEA_YCRCB444)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
- if (edid_ext[3] & EDID_CEA_YCRCB422)
- info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+
+ /* CTA DisplayID Data Block does not have byte #3 */
+ if (edid_ext[0] == CEA_EXT) {
+ if (edid_ext[3] & EDID_CEA_YCRCB444)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR444;
+ if (edid_ext[3] & EDID_CEA_YCRCB422)
+ info->color_formats |= DRM_COLOR_FORMAT_YCBCR422;
+ }
if (cea_db_offsets(edid_ext, &start, &end))
return;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From aa1b46dcdc7baaf5fec0be25782ef24b26aa209e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj(a)kernel.org>
Date: Sun, 13 Mar 2022 21:15:02 -1000
Subject: [PATCH] block: fix rq-qos breakage from skipping rq_qos_done_bio()
a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't
tracked") made bio_endio() skip rq_qos_done_bio() if BIO_TRACKED is not set.
While this fixed a potential oops, it also broke blk-iocost by skipping the
done_bio callback for merged bios.
Before, whether a bio goes through rq_qos_throttle() or rq_qos_merge(),
rq_qos_done_bio() would be called on the bio on completion with BIO_TRACKED
distinguishing the former from the latter. After a647a524a467,
rq_qos_done_bio() is no longer called for bios which went through
rq_qos_merge(). This royally confuses
blk-iocost as the merged bios never finish and are considered perpetually
in-flight.
One reliably reproducible failure mode is an intermediate cgroup getting
stuck active preventing its children from being activated due to the
leaf-only rule, leading to loss of control. The following is from
resctl-bench protection scenario which emulates isolating a web server like
workload from a memory bomb run on an iocost configuration which should
yield a reasonable level of protection.
# cat /sys/block/nvme2n1/device/model
Samsung SSD 970 PRO 512GB
# cat /sys/fs/cgroup/io.cost.model
259:0 ctrl=user model=linear rbps=834913556 rseqiops=93622 rrandiops=102913 wbps=618985353 wseqiops=72325 wrandiops=71025
# cat /sys/fs/cgroup/io.cost.qos
259:0 enable=1 ctrl=user rpct=95.00 rlat=18776 wpct=95.00 wlat=8897 min=60.00 max=100.00
# resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1
...
Memory Hog Summary
==================
IO Latency: R p50=242u:336u/2.5m p90=794u:1.4m/7.5m p99=2.7m:8.0m/62.5m max=8.0m:36.4m/350m
W p50=221u:323u/1.5m p90=709u:1.2m/5.5m p99=1.5m:2.5m/9.5m max=6.9m:35.9m/350m
Isolation and Request Latency Impact Distributions:
min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev
isol% 15.90 15.90 15.90 40.05 57.24 59.07 60.01 74.63 74.63 90.35 90.35 58.12 15.82
lat-imp% 0 0 0 0 0 4.55 14.68 15.54 233.5 548.1 548.1 53.88 143.6
Result: isol=58.12:15.82% lat_imp=53.88%:143.6 work_csv=100.0% missing=3.96%
The isolation result of 58.12% is close to what this device would show
without any IO control.
Fix it by introducing a new flag BIO_QOS_MERGED to mark merged bios and
calling rq_qos_done_bio() on them too. For consistency and clarity, rename
BIO_TRACKED to BIO_QOS_THROTTLED. The flag checks are moved into
rq_qos_done_bio() so that it's next to the code paths that set the flags.
With the patch applied, the above same benchmark shows:
# resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1
...
Memory Hog Summary
==================
IO Latency: R p50=123u:84.4u/985u p90=322u:256u/2.5m p99=1.6m:1.4m/9.5m max=11.1m:36.0m/350m
W p50=429u:274u/995u p90=1.7m:1.3m/4.5m p99=3.4m:2.7m/11.5m max=7.9m:5.9m/26.5m
Isolation and Request Latency Impact Distributions:
min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev
isol% 84.91 84.91 89.51 90.73 92.31 94.49 96.36 98.04 98.71 100.0 100.0 94.42 2.81
lat-imp% 0 0 0 0 0 2.81 5.73 11.11 13.92 17.53 22.61 4.10 4.68
Result: isol=94.42:2.81% lat_imp=4.10%:4.68 work_csv=58.34% missing=0%
Signed-off-by: Tejun Heo <tj(a)kernel.org>
Fixes: a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked")
Cc: stable(a)vger.kernel.org # v5.15+
Cc: Ming Lei <ming.lei(a)redhat.com>
Cc: Yu Kuai <yukuai3(a)huawei.com>
Reviewed-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/Yi7rdrzQEHjJLGKB@slm.duckdns.org
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/block/bio.c b/block/bio.c
index 151cace2dbe1..33979f306e9e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1516,8 +1516,7 @@ void bio_endio(struct bio *bio)
if (!bio_integrity_endio(bio))
return;
- if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
- rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
+ rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 010e658d44a8..2f33932e72e3 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
int inflight = 0;
blkg = bio->bi_blkg;
- if (!blkg || !bio_flagged(bio, BIO_TRACKED))
+ if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
return;
iolat = blkg_to_lat(bio->bi_blkg);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 3cfbc8668cba..68267007da1c 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
__rq_qos_requeue(q->rq_qos, rq);
}
-static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+static inline void rq_qos_done_bio(struct bio *bio)
{
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
+ if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+ bio_flagged(bio, BIO_QOS_MERGED))) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ if (q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
+ }
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- /*
- * BIO_TRACKED lets controllers know that a bio went through the
- * normal rq_qos path.
- */
if (q->rq_qos) {
- bio_set_flag(bio, BIO_TRACKED);
+ bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
}
@@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (q->rq_qos) {
+ bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
+ }
}
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 5561e58d158a..0c3563b45fe9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -324,7 +324,8 @@ enum {
BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion
* of this bio. */
BIO_CGROUP_ACCT, /* has been accounted to a cgroup */
- BIO_TRACKED, /* set if bio goes through the rq_qos path */
+ BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */
+ BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */
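To see why moving the flag checks into rq_qos_done_bio() matters, here is a
toy userspace model of the flag lifecycle (plain C; toy_bio and these helpers
are illustrative stand-ins for the kernel structures, not kernel API):

#include <stdio.h>

enum { QOS_THROTTLED = 1 << 0, QOS_MERGED = 1 << 1 };

struct toy_bio { unsigned int flags; };

/* Submission: the throttle path marks the bio... */
static void qos_throttle(struct toy_bio *b) { b->flags |= QOS_THROTTLED; }

/* ...and, after the fix, so does the merge path. */
static void qos_merge(struct toy_bio *b) { b->flags |= QOS_MERGED; }

/*
 * Completion: both kinds of bio now reach the done callback. With the
 * old BIO_TRACKED-only check, a merged bio was skipped here, so an
 * iocost-style controller counted it as perpetually in flight.
 */
static void qos_done(struct toy_bio *b)
{
	if (b->flags & (QOS_THROTTLED | QOS_MERGED))
		printf("done_bio (flags=%#x)\n", b->flags);
}

int main(void)
{
	struct toy_bio merged = { 0 };

	qos_merge(&merged);
	qos_done(&merged);	/* fires now; did not with BIO_TRACKED alone */
	return 0;
}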
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 031495635b4668f94e964e037ca93d0d38bfde58 Mon Sep 17 00:00:00 2001
From: Vijay Balakrishna <vijayb(a)linux.microsoft.com>
Date: Wed, 2 Mar 2022 09:38:09 -0800
Subject: [PATCH] arm64: Do not defer reserve_crashkernel() for platforms with
no DMA memory zones
The following patches resulted in deferring crash kernel reservation to
mem_init(), mainly aimed at platforms with DMA memory zones (no IOMMU),
in particular Raspberry Pi 4.
commit 1a8e1cef7603 ("arm64: use both ZONE_DMA and ZONE_DMA32")
commit 8424ecdde7df ("arm64: mm: Set ZONE_DMA size based on devicetree's dma-ranges")
commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into mem_init()")
commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel reservation is required")
The above changes introduced a boot slowdown due to linear map creation for
all the memory banks with NO_BLOCK_MAPPINGS, see discussion[1]. The proposed
changes restore crash kernel reservation to the earlier behavior, thus
avoiding the slow boot, particularly for platforms with IOMMU (no DMA memory
zones).
Tested the changes to confirm the ~150ms boot slowdown is gone on our SoC
with IOMMU and 8GB memory. Also tested with ZONE_DMA and/or ZONE_DMA32
configs to confirm no regression to the deferring scheme of crash kernel
memory reservation. In both cases the kernel crash dump was successfully
collected.
[1] https://lore.kernel.org/all/9436d033-579b-55fa-9b00-6f4b661c2dd7@linux.micr…
Signed-off-by: Vijay Balakrishna <vijayb(a)linux.microsoft.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Link: https://lore.kernel.org/r/1646242689-20744-1-git-send-email-vijayb@linux.mi…
[will: Add #ifdef CONFIG_KEXEC_CORE guards to fix 'crashk_res' references in allnoconfig build]
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index db63cc885771..919be440494f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -61,8 +61,34 @@ EXPORT_SYMBOL(memstart_addr);
* unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
* In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
* otherwise it is empty.
+ *
+ * Memory reservation for crash kernel either done early or deferred
+ * depending on DMA memory zones configs (ZONE_DMA) --
+ *
+ * In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
+ * here instead of max_zone_phys(). This lets early reservation of
+ * crash kernel memory which has a dependency on arm64_dma_phys_limit.
+ * Reserving memory early for crash kernel allows linear creation of block
+ * mappings (greater than page-granularity) for all the memory bank rangs.
+ * In this scheme a comparatively quicker boot is observed.
+ *
+ * If ZONE_DMA configs are defined, crash kernel memory reservation
+ * is delayed until DMA zone memory range size initilazation performed in
+ * zone_sizes_init(). The defer is necessary to steer clear of DMA zone
+ * memory range to avoid overlap allocation. So crash kernel memory boundaries
+ * are not known when mapping all bank memory ranges, which otherwise means
+ * not possible to exclude crash kernel range from creating block mappings
+ * so page-granularity mappings are created for the entire memory range.
+ * Hence a slightly slower boot is observed.
+ *
+ * Note: Page-granularity mapppings are necessary for crash kernel memory
+ * range for shrinking its size via /sys/kernel/kexec_crash_size interface.
*/
-phys_addr_t arm64_dma_phys_limit __ro_after_init;
+#if IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32)
+phys_addr_t __ro_after_init arm64_dma_phys_limit;
+#else
+const phys_addr_t arm64_dma_phys_limit = PHYS_MASK + 1;
+#endif
#ifdef CONFIG_KEXEC_CORE
/*
@@ -153,8 +179,6 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
if (!arm64_dma_phys_limit)
arm64_dma_phys_limit = dma32_phys_limit;
#endif
- if (!arm64_dma_phys_limit)
- arm64_dma_phys_limit = PHYS_MASK + 1;
max_zone_pfns[ZONE_NORMAL] = max;
free_area_init(max_zone_pfns);
@@ -315,6 +339,9 @@ void __init arm64_memblock_init(void)
early_init_fdt_scan_reserved_mem();
+ if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
+ reserve_crashkernel();
+
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}
@@ -361,7 +388,8 @@ void __init bootmem_init(void)
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
- reserve_crashkernel();
+ if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
+ reserve_crashkernel();
memblock_dump_all();
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index acfae9b41cc8..ed21bf83d0b7 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -517,7 +517,7 @@ static void __init map_mem(pgd_t *pgdp)
*/
BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
- if (can_set_direct_map() || crash_mem_map || IS_ENABLED(CONFIG_KFENCE))
+ if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -528,6 +528,17 @@ static void __init map_mem(pgd_t *pgdp)
*/
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
+#ifdef CONFIG_KEXEC_CORE
+ if (crash_mem_map) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA) ||
+ IS_ENABLED(CONFIG_ZONE_DMA32))
+ flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+ else if (crashk_res.end)
+ memblock_mark_nomap(crashk_res.start,
+ resource_size(&crashk_res));
+ }
+#endif
+
/* map all the memory banks */
for_each_mem_range(i, &start, &end) {
if (start >= end)
@@ -554,6 +565,25 @@ static void __init map_mem(pgd_t *pgdp)
__map_memblock(pgdp, kernel_start, kernel_end,
PAGE_KERNEL, NO_CONT_MAPPINGS);
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
+
+ /*
+ * Use page-level mappings here so that we can shrink the region
+ * in page granularity and put back unused memory to buddy system
+ * through /sys/kernel/kexec_crash_size interface.
+ */
+#ifdef CONFIG_KEXEC_CORE
+ if (crash_mem_map &&
+ !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
+ if (crashk_res.end) {
+ __map_memblock(pgdp, crashk_res.start,
+ crashk_res.end + 1,
+ PAGE_KERNEL,
+ NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
+ memblock_clear_nomap(crashk_res.start,
+ resource_size(&crashk_res));
+ }
+ }
+#endif
}
void mark_rodata_ro(void)
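The reservation policy the patch implements can be condensed into two small
helpers. This is a sketch only, assuming the config checks fold to constants
the way IS_ENABLED() does; TOY_PHYS_MASK is a placeholder for the real
PHYS_MASK, which depends on the configured physical address size:

#include <stdbool.h>
#include <stdint.h>

#define TOY_PHYS_MASK	((((uint64_t)1) << 48) - 1)	/* placeholder */

/*
 * No DMA zones: the DMA limit is a compile-time constant, so the crash
 * kernel can be reserved early from arm64_memblock_init() and the linear
 * map keeps block mappings (quicker boot).
 */
static bool reserve_crashkernel_early(bool zone_dma, bool zone_dma32)
{
	return !zone_dma && !zone_dma32;
}

/*
 * With a DMA zone the limit is only known after zone sizing, hence the
 * deferred reservation in bootmem_init().
 */
static uint64_t dma_phys_limit(bool any_dma_zone, uint64_t zone_limit)
{
	return any_dma_zone ? zone_limit : TOY_PHYS_MASK + 1;
}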
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 031495635b4668f94e964e037ca93d0d38bfde58 Mon Sep 17 00:00:00 2001
From: Vijay Balakrishna <vijayb(a)linux.microsoft.com>
Date: Wed, 2 Mar 2022 09:38:09 -0800
Subject: [PATCH] arm64: Do not defer reserve_crashkernel() for platforms with
no DMA memory zones
The following patches resulted in deferring crash kernel reservation to
mem_init(), mainly aimed at platforms with DMA memory zones (no IOMMU),
in particular Raspberry Pi 4.
commit 1a8e1cef7603 ("arm64: use both ZONE_DMA and ZONE_DMA32")
commit 8424ecdde7df ("arm64: mm: Set ZONE_DMA size based on devicetree's dma-ranges")
commit 0a30c53573b0 ("arm64: mm: Move reserve_crashkernel() into mem_init()")
commit 2687275a5843 ("arm64: Force NO_BLOCK_MAPPINGS if crashkernel reservation is required")
The above changes introduced a boot slowdown due to linear map creation for
all the memory banks with NO_BLOCK_MAPPINGS, see discussion[1]. The proposed
changes restore crash kernel reservation to the earlier behavior, thus
avoiding the slow boot, particularly for platforms with IOMMU (no DMA memory
zones).
Tested the changes to confirm the ~150ms boot slowdown is gone on our SoC
with IOMMU and 8GB memory. Also tested with ZONE_DMA and/or ZONE_DMA32
configs to confirm no regression to the deferring scheme of crash kernel
memory reservation. In both cases the kernel crash dump was successfully
collected.
[1] https://lore.kernel.org/all/9436d033-579b-55fa-9b00-6f4b661c2dd7@linux.micr…
Signed-off-by: Vijay Balakrishna <vijayb(a)linux.microsoft.com>
Cc: stable(a)vger.kernel.org
Reviewed-by: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Link: https://lore.kernel.org/r/1646242689-20744-1-git-send-email-vijayb@linux.mi…
[will: Add #ifdef CONFIG_KEXEC_CORE guards to fix 'crashk_res' references in allnoconfig build]
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index db63cc885771..919be440494f 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -61,8 +61,34 @@ EXPORT_SYMBOL(memstart_addr);
* unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
* In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
* otherwise it is empty.
+ *
+ * Memory reservation for crash kernel either done early or deferred
+ * depending on DMA memory zones configs (ZONE_DMA) --
+ *
+ * In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
+ * here instead of max_zone_phys(). This lets early reservation of
+ * crash kernel memory which has a dependency on arm64_dma_phys_limit.
+ * Reserving memory early for crash kernel allows linear creation of block
+ * mappings (greater than page-granularity) for all the memory bank rangs.
+ * In this scheme a comparatively quicker boot is observed.
+ *
+ * If ZONE_DMA configs are defined, crash kernel memory reservation
+ * is delayed until DMA zone memory range size initilazation performed in
+ * zone_sizes_init(). The defer is necessary to steer clear of DMA zone
+ * memory range to avoid overlap allocation. So crash kernel memory boundaries
+ * are not known when mapping all bank memory ranges, which otherwise means
+ * not possible to exclude crash kernel range from creating block mappings
+ * so page-granularity mappings are created for the entire memory range.
+ * Hence a slightly slower boot is observed.
+ *
+ * Note: Page-granularity mapppings are necessary for crash kernel memory
+ * range for shrinking its size via /sys/kernel/kexec_crash_size interface.
*/
-phys_addr_t arm64_dma_phys_limit __ro_after_init;
+#if IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32)
+phys_addr_t __ro_after_init arm64_dma_phys_limit;
+#else
+const phys_addr_t arm64_dma_phys_limit = PHYS_MASK + 1;
+#endif
#ifdef CONFIG_KEXEC_CORE
/*
@@ -153,8 +179,6 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
if (!arm64_dma_phys_limit)
arm64_dma_phys_limit = dma32_phys_limit;
#endif
- if (!arm64_dma_phys_limit)
- arm64_dma_phys_limit = PHYS_MASK + 1;
max_zone_pfns[ZONE_NORMAL] = max;
free_area_init(max_zone_pfns);
@@ -315,6 +339,9 @@ void __init arm64_memblock_init(void)
early_init_fdt_scan_reserved_mem();
+ if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
+ reserve_crashkernel();
+
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}
@@ -361,7 +388,8 @@ void __init bootmem_init(void)
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
- reserve_crashkernel();
+ if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
+ reserve_crashkernel();
memblock_dump_all();
}
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index acfae9b41cc8..ed21bf83d0b7 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -517,7 +517,7 @@ static void __init map_mem(pgd_t *pgdp)
*/
BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end));
- if (can_set_direct_map() || crash_mem_map || IS_ENABLED(CONFIG_KFENCE))
+ if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
/*
@@ -528,6 +528,17 @@ static void __init map_mem(pgd_t *pgdp)
*/
memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
+#ifdef CONFIG_KEXEC_CORE
+ if (crash_mem_map) {
+ if (IS_ENABLED(CONFIG_ZONE_DMA) ||
+ IS_ENABLED(CONFIG_ZONE_DMA32))
+ flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+ else if (crashk_res.end)
+ memblock_mark_nomap(crashk_res.start,
+ resource_size(&crashk_res));
+ }
+#endif
+
/* map all the memory banks */
for_each_mem_range(i, &start, &end) {
if (start >= end)
@@ -554,6 +565,25 @@ static void __init map_mem(pgd_t *pgdp)
__map_memblock(pgdp, kernel_start, kernel_end,
PAGE_KERNEL, NO_CONT_MAPPINGS);
memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
+
+ /*
+ * Use page-level mappings here so that we can shrink the region
+ * in page granularity and put back unused memory to buddy system
+ * through /sys/kernel/kexec_crash_size interface.
+ */
+#ifdef CONFIG_KEXEC_CORE
+ if (crash_mem_map &&
+ !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
+ if (crashk_res.end) {
+ __map_memblock(pgdp, crashk_res.start,
+ crashk_res.end + 1,
+ PAGE_KERNEL,
+ NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
+ memblock_clear_nomap(crashk_res.start,
+ resource_size(&crashk_res));
+ }
+ }
+#endif
}
void mark_rodata_ro(void)
From: Ben Dooks <ben.dooks(a)codethink.co.uk>
commit a382c757ec5ef83137a86125f43a4c43dc2ab50b upstream.
The fu740 PCIe core does not probe any devices on the SiFive Unmatched
board without this fix (or having U-Boot explicitly start the PCIe via
either boot-script or user command). The fix is to start the link at
2.5GT/s and, once the link is up, change the maximum speed back to the
default.
The U-Boot driver claims to set the link-speed to 2.5GT/s to get the probe
to work (and U-Boot does print link up at 2.5GT/s) in the following code:
https://source.denx.de/u-boot/u-boot/-/blob/master/drivers/pci/pcie_dw_sifi…
Link: https://lore.kernel.org/r/20220318152430.526320-1-ben.dooks@codethink.co.uk
Signed-off-by: Ben Dooks <ben.dooks(a)codethink.co.uk>
Signed-off-by: Bjorn Helgaas <bhelgaas(a)google.com>
Acked-by: Palmer Dabbelt <palmer(a)rivosinc.com>
Signed-off-by: Dimitri John Ledkov <dimitri.ledkov(a)canonical.com>
---
Please apply this patch to v5.15+ stable trees which fixes PCIe on
the very popular SiFive Unmatched RISC-V board.
drivers/pci/controller/dwc/pcie-fu740.c | 51 ++++++++++++++++++++++++-
1 file changed, 50 insertions(+), 1 deletion(-)
diff --git a/drivers/pci/controller/dwc/pcie-fu740.c b/drivers/pci/controller/dwc/pcie-fu740.c
index 00cde9a248b5..78d002be4f82 100644
--- a/drivers/pci/controller/dwc/pcie-fu740.c
+++ b/drivers/pci/controller/dwc/pcie-fu740.c
@@ -181,10 +181,59 @@ static int fu740_pcie_start_link(struct dw_pcie *pci)
{
struct device *dev = pci->dev;
struct fu740_pcie *afp = dev_get_drvdata(dev);
+ u8 cap_exp = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+ int ret;
+ u32 orig, tmp;
+
+ /*
+ * Force 2.5GT/s when starting the link, due to some devices not
+ * probing at higher speeds. This happens with the PCIe switch
+ * on the Unmatched board when U-Boot has not initialised the PCIe.
+ * The fix in U-Boot is to force 2.5GT/s, which then gets cleared
+ * by the soft reset done by this driver.
+ */
+ dev_dbg(dev, "cap_exp at %x\n", cap_exp);
+ dw_pcie_dbi_ro_wr_en(pci);
+
+ tmp = dw_pcie_readl_dbi(pci, cap_exp + PCI_EXP_LNKCAP);
+ orig = tmp & PCI_EXP_LNKCAP_SLS;
+ tmp &= ~PCI_EXP_LNKCAP_SLS;
+ tmp |= PCI_EXP_LNKCAP_SLS_2_5GB;
+ dw_pcie_writel_dbi(pci, cap_exp + PCI_EXP_LNKCAP, tmp);
/* Enable LTSSM */
writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_APP_LTSSM_ENABLE);
- return 0;
+
+ ret = dw_pcie_wait_for_link(pci);
+ if (ret) {
+ dev_err(dev, "error: link did not start\n");
+ goto err;
+ }
+
+ tmp = dw_pcie_readl_dbi(pci, cap_exp + PCI_EXP_LNKCAP);
+ if ((tmp & PCI_EXP_LNKCAP_SLS) != orig) {
+ dev_dbg(dev, "changing speed back to original\n");
+
+ tmp &= ~PCI_EXP_LNKCAP_SLS;
+ tmp |= orig;
+ dw_pcie_writel_dbi(pci, cap_exp + PCI_EXP_LNKCAP, tmp);
+
+ tmp = dw_pcie_readl_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL);
+ tmp |= PORT_LOGIC_SPEED_CHANGE;
+ dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, tmp);
+
+ ret = dw_pcie_wait_for_link(pci);
+ if (ret) {
+ dev_err(dev, "error: link did not start at new speed\n");
+ goto err;
+ }
+ }
+
+ ret = 0;
+err:
+ WARN_ON(ret); /* we assume that errors will be very rare */
+ dw_pcie_dbi_ro_wr_dis(pci);
+ return ret;
}
static int fu740_pcie_host_init(struct pcie_port *pp)
--
2.32.0
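The clamp-and-restore in the fix operates on the Supported Link Speeds field
of the Link Capabilities register. A small sketch of just that bit
manipulation (the two PCI_EXP_LNKCAP_* values match
include/uapi/linux/pci_regs.h; clamp_link_speed() is an illustrative helper,
not a kernel function):

#include <stdint.h>

#define PCI_EXP_LNKCAP_SLS	 0x0000000f	/* Supported Link Speeds */
#define PCI_EXP_LNKCAP_SLS_2_5GB 0x00000001	/* 2.5GT/s */

/*
 * Clamp the advertised speed to 2.5GT/s for link training, saving the
 * original bits so the caller can restore them once the link is up.
 */
static uint32_t clamp_link_speed(uint32_t lnkcap, uint32_t *orig)
{
	*orig = lnkcap & PCI_EXP_LNKCAP_SLS;
	lnkcap &= ~PCI_EXP_LNKCAP_SLS;
	lnkcap |= PCI_EXP_LNKCAP_SLS_2_5GB;
	return lnkcap;
}

After dw_pcie_wait_for_link() succeeds, the hunk above writes the saved bits
back and retrains via PORT_LOGIC_SPEED_CHANGE.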
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Date: Thu, 24 Mar 2022 18:14:09 -0700
Subject: [PATCH] mm: fix race between MADV_FREE reclaim and blkdev direct IO
read
Problem:
=======
Userspace might read the zero-page instead of actual data from a direct IO
read on a block device if madvise(MADV_FREE) has been called on the buffers
earlier (this is discussed below), due to a race between page reclaim on
MADV_FREE and blkdev direct IO read.
- Race condition:
==============
During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks
if the page is not dirty, then discards its rmap PTE(s) (vs. remap back
if the page is dirty).
However, after try_to_unmap_one() returns to shrink_page_list(), it might
keep the page _anyway_ if page_ref_freeze() fails (it expects exactly
_one_ page reference, from the isolation for page reclaim).
Well, blkdev_direct_IO() gets references for all pages, and on READ
operations it only sets them dirty _later_.
So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct
IO read from block devices, and page reclaim happens during
__blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages()
returns, but BEFORE the pages are set dirty, the situation happens.
The direct IO read eventually completes. Now, when userspace reads the
buffers, the PTE is no longer there and the page fault handler
do_anonymous_page() services that with the zero-page, NOT the data!
A synthetic reproducer is provided.
- Page faults:
===========
If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't
happen, because that faults-in all pages as writeable, so
do_anonymous_page() sets up a new page/rmap/PTE, and that is used by
direct IO. The userspace reads don't fault as the PTE is there (thus
zero-page is not used/setup).
But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE
is no longer there; the subsequent page faults can't help:
The data-read from the block device probably won't generate faults due to
DMA (no MMU) but even in the case it wouldn't use DMA, that happens on
different virtual addresses (not user-mapped addresses) because `struct
bio_vec` stores `struct page` to figure addresses out (which are different
from user-mapped addresses) for the read.
Thus userspace reads (to user-mapped addresses) still fault, then
do_anonymous_page() gets another `struct page` that would address/map to
other memory than the `struct page` used by `struct bio_vec` for the read.
(The original `struct page` is not available, since it wasn't freed, as
page_ref_freeze() failed due to more page refs. And even if it were
available, its data cannot be trusted anymore.)
Solution:
========
One solution is to check for the expected page reference count in
try_to_unmap_one().
There should be one reference from the isolation (that is also checked in
shrink_page_list() with page_ref_freeze()) plus one or more references
from page mapping(s) (put in discard: label). Further references mean
that rmap/PTE cannot be unmapped/nuked.
(Note: there might be more than one reference from mapping due to
fork()/clone() without CLONE_VM, which use the same `struct page` for
references, until the copy-on-write page gets copied.)
So, additional page references (e.g., from direct IO read) now prevent the
rmap/PTE from being unmapped/dropped, similarly to how the page is not freed
per shrink_page_list()/page_ref_freeze().
- Races and Barriers:
==================
The new check in try_to_unmap_one() should be safe in races with
bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's
done under the PTE lock.
The fast path doesn't take the lock, but it checks if the PTE has changed
and if so, it drops the reference and leaves the page for the slow path
(which does take that lock).
The fast path requires synchronization w/ full memory barrier: it writes
the page reference count first then it reads the PTE later, while
try_to_unmap() writes PTE first then it reads page refcount.
And a second barrier is needed, as the page dirty flag should not be read
before the page reference count (as in __remove_mapping()). (This can be
a load memory barrier only; no writes are involved.)
Call stack/comments:
- try_to_unmap_one()
- page_vma_mapped_walk()
- map_pte() # see pte_offset_map_lock():
pte_offset_map()
spin_lock()
- ptep_get_and_clear() # write PTE
- smp_mb() # (new barrier) GUP fast path
- page_ref_count() # (new check) read refcount
- page_vma_mapped_walk_done() # see pte_unmap_unlock():
pte_unmap()
spin_unlock()
- bio_iov_iter_get_pages()
- __bio_iov_iter_get_pages()
- iov_iter_get_pages()
- get_user_pages_fast()
- internal_get_user_pages_fast()
# fast path
- lockless_pages_from_mm()
- gup_{pgd,p4d,pud,pmd,pte}_range()
ptep = pte_offset_map() # not _lock()
pte = ptep_get_lockless(ptep)
page = pte_page(pte)
try_grab_compound_head(page) # inc refcount
# (RMW/barrier
# on success)
if (pte_val(pte) != pte_val(*ptep)) # read PTE
put_compound_head(page) # dec refcount
# go slow path
# slow path
- __gup_longterm_unlocked()
- get_user_pages_unlocked()
- __get_user_pages_locked()
- __get_user_pages()
- follow_{page,p4d,pud,pmd}_mask()
- follow_page_pte()
ptep = pte_offset_map_lock()
pte = *ptep
page = vm_normal_page(pte)
try_grab_page(page) # inc refcount
pte_unmap_unlock()
- Huge Pages:
==========
Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE
(aka lazyfree) pages are PageAnon() && !PageSwapBacked()
(madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn())
thus should reach shrink_page_list() -> split_huge_page_to_list() before
try_to_unmap[_one](), so it deals with normal pages only.
(And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens,
which should not happen or should be rare, the page refcount should be
greater than the mapcount: the head page is referenced by tail pages. That
also prevents checking the head `page` and then incorrectly calling
page_remove_rmap(subpage) for a tail page that isn't even in the
shrink_page_list()'s page_list (an effect of split huge pmd/pvmw), as it
might happen today in this unlikely
scenario.)
MADV_FREE'd buffers:
===================
So, back to the "if MADV_FREE pages are used as buffers" note. The case
is arguable, and subject to multiple interpretations.
The madvise(2) manual page on the MADV_FREE advice value says:
1) 'After a successful MADV_FREE ... data will be lost when
the kernel frees the pages.'
2) 'the free operation will be canceled if the caller writes
into the page' / 'subsequent writes ... will succeed and
then [the] kernel cannot free those dirtied pages'
3) 'If there is no subsequent write, the kernel can free the
pages at any time.'
Thoughts, questions, considerations... respectively:
1) Since the kernel didn't actually free the page (page_ref_freeze()
failed), should the data not have been lost? (on userspace read.)
2) Should writes performed by the direct IO read be able to cancel
the free operation?
- Should the direct IO read be considered as 'the caller' too,
as it's been requested by 'the caller'?
- Should the bio technique to dirty pages on return to userspace
(bio_check_pages_dirty() is called/used by __blkdev_direct_IO())
be considered in another/special way here?
3) Should an upcoming write from a previously requested direct IO
read be considered as a subsequent write, so the kernel should
not free the pages? (as it's known at the time of page reclaim.)
And lastly:
Technically, the last point would seem a reasonable consideration and
balance, as the madvise(2) manual page apparently (and fairly) seems to
assume that 'writes' are memory accesses from the userspace process (not
explicitly considering writes from the kernel or its corner cases; again,
fairly). Plus, the kernel fix implementation for the corner case of the
largely 'non-atomic write' encompassed by a direct IO read operation is
relatively simple; and it helps.
Reproducer:
==========
@ test.c (simplified, but works)
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
int main() {
int fd, i;
char *buf;
fd = open(DEV, O_RDONLY | O_DIRECT);
buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
buf[i] = 1; // init to non-zero
madvise(buf, BUF_SIZE, MADV_FREE);
read(fd, buf, BUF_SIZE);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
printf("%p: 0x%x\n", &buf[i], buf[i]);
return 0;
}
@ block/fops.c (formerly fs/block_dev.c)
+#include <linux/swap.h>
...
... __blkdev_direct_IO[_simple](...)
{
...
+ if (!strcmp(current->comm, "good"))
+ shrink_all_memory(ULONG_MAX);
+
ret = bio_iov_iter_get_pages(...);
+
+ if (!strcmp(current->comm, "bad"))
+ shrink_all_memory(ULONG_MAX);
...
}
@ shell
# NUM_PAGES=4
# PAGE_SIZE=$(getconf PAGE_SIZE)
# yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES}
# DEV=$(losetup -f --show test.img)
# gcc -DDEV=\"$DEV\" \
-DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \
-DPAGE_SIZE=${PAGE_SIZE} \
test.c -o test
# od -tx1 $DEV
0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a
*
0040000
# mv test good
# ./good
0x7f7c10418000: 0x79
0x7f7c10419000: 0x79
0x7f7c1041a000: 0x79
0x7f7c1041b000: 0x79
# mv good bad
# ./bad
0x7fa1b8050000: 0x0
0x7fa1b8051000: 0x0
0x7fa1b8052000: 0x0
0x7fa1b8053000: 0x0
Note: the issue is consistent on v5.17-rc3, but it's intermittent with the
support of MADV_FREE on v4.5 (60%-70% error; needs swap). [wrap
do_direct_IO() in do_blockdev_direct_IO() @ fs/direct-io.c].
- v5.17-rc3:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x0
# free | grep Swap
Swap: 0 0 0
- v4.5:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
2702 0x0
1298 0x79
# swapoff -av
swapoff /swap
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
Ceph/TCMalloc:
=============
For documentation purposes, the use case driving the analysis/fix is Ceph
on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to
release unused memory to the system from the mmap'ed page heap (might be
committed back/used again; it's not munmap'ed.)
- PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise()
- PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing.
Note: TCMalloc switched back to MADV_DONTNEED a few commits after the
release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue
just 'disappeared' on Ceph on later Ubuntu releases but is still present
in the kernel, and can be hit by other use cases.
The observed issue seems to be the old Ceph bug #22464 [1], where checksum
mismatches are observed (and instrumentation with buffer dumps shows
zero-pages read from mmap'ed/MADV_FREE'd page ranges).
The issue in Ceph was reasonably deemed a kernel bug (comment #50) and
mostly worked around with a retry mechanism, but other parts of Ceph could
still hit that (rocksdb). Anyway, it's less likely to be hit again as
TCMalloc switched out of MADV_FREE by default.
(Some kernel versions/reports from the Ceph bug, and relation with
the MADV_FREE introduction/changes; TCMalloc versions not checked.)
- 4.4 good
- 4.5 (madv_free: introduction)
- 4.9 bad
- 4.10 good? maybe a swapless system
- 4.12 (madv_free: no longer free instantly on swapless systems)
- 4.13 bad
[1] https://tracker.ceph.com/issues/22464
Thanks:
======
Several people contributed to analysis/discussions/tests/reproducers in
the first stages when drilling down on ceph/tcmalloc/linux kernel:
- Dan Hill
- Dan Streetman
- Dongdong Tao
- Gavin Guo
- Gerald Yang
- Heitor Alves de Siqueira
- Ioanna Alifieraki
- Jay Vosburgh
- Matthew Ruffell
- Ponnuvel Palaniyappan
Reviews, suggestions, corrections, comments:
- Minchan Kim
- Yu Zhao
- Huang, Ying
- John Hubbard
- Christoph Hellwig
[mfo(a)canonical.com: v4]
Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.com
Link: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com
Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Signed-off-by: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Dan Hill <daniel.hill(a)canonical.com>
Cc: Dan Streetman <dan.streetman(a)canonical.com>
Cc: Dongdong Tao <dongdong.tao(a)canonical.com>
Cc: Gavin Guo <gavin.guo(a)canonical.com>
Cc: Gerald Yang <gerald.yang(a)canonical.com>
Cc: Heitor Alves de Siqueira <halves(a)canonical.com>
Cc: Ioanna Alifieraki <ioanna-maria.alifieraki(a)canonical.com>
Cc: Jay Vosburgh <jay.vosburgh(a)canonical.com>
Cc: Matthew Ruffell <matthew.ruffell(a)canonical.com>
Cc: Ponnuvel Palaniyappan <ponnuvel.palaniyappan(a)canonical.com>
Cc: <stable(a)vger.kernel.org>
Cc: Christoph Hellwig <hch(a)infradead.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/rmap.c b/mm/rmap.c
index bfcc8e3d412f..5cb970d51f0a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1588,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* MADV_FREE page check */
if (!folio_test_swapbacked(folio)) {
- if (!folio_test_dirty(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
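The whole fix hinges on one invariant: a clean lazyfree page may only be
discarded when its only references are the reclaim isolation plus its
mappings. A standalone condensation of that check (plain counters in place
of the folio accessors; the barriers from the hunk above are reduced to a
comment):

#include <stdbool.h>

/*
 * One ref from reclaim's isolation plus one per mapping is the expected
 * total; any extra ref (e.g. a direct IO read still holding the page)
 * means the rmap/PTE must not be dropped. In the kernel this check sits
 * between smp_mb() and smp_rmb() to order the refcount read against
 * GUP-fast's PTE recheck and against the dirty-flag read.
 */
static bool can_discard_lazyfree(int ref_count, int map_count, bool dirty)
{
	return ref_count == 1 + map_count && !dirty;
}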
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Date: Thu, 24 Mar 2022 18:14:09 -0700
Subject: [PATCH] mm: fix race between MADV_FREE reclaim and blkdev direct IO
read
Problem:
=======
Userspace might read the zero-page instead of actual data from a direct IO
read on a block device if madvise(MADV_FREE) has been called on the buffers
earlier (this is discussed below), due to a race between page reclaim on
MADV_FREE and blkdev direct IO read.
- Race condition:
==============
During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks
if the page is not dirty, then discards its rmap PTE(s) (vs. remap back
if the page is dirty).
However, after try_to_unmap_one() returns to shrink_page_list(), it might
keep the page _anyway_ if page_ref_freeze() fails (it expects exactly
_one_ page reference, from the isolation for page reclaim).
Well, blkdev_direct_IO() gets references for all pages, and on READ
operations it only sets them dirty _later_.
So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct
IO read from block devices, and page reclaim happens during
__blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages()
returns, but BEFORE the pages are set dirty, the situation happens.
The direct IO read eventually completes. Now, when userspace reads the
buffers, the PTE is no longer there and the page fault handler
do_anonymous_page() services that with the zero-page, NOT the data!
A synthetic reproducer is provided.
- Page faults:
===========
If page reclaim happens BEFORE bio_iov_iter_get_pages() the issue doesn't
happen, because that faults-in all pages as writeable, so
do_anonymous_page() sets up a new page/rmap/PTE, and that is used by
direct IO. The userspace reads don't fault as the PTE is there (thus
zero-page is not used/setup).
But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE
is no longer there; the subsequent page faults can't help:
The data-read from the block device probably won't generate faults due to
DMA (no MMU) but even in the case it wouldn't use DMA, that happens on
different virtual addresses (not user-mapped addresses) because `struct
bio_vec` stores `struct page` to figure addresses out (which are different
from user-mapped addresses) for the read.
Thus userspace reads (to user-mapped addresses) still fault, then
do_anonymous_page() gets another `struct page` that would address/map to
other memory than the `struct page` used by `struct bio_vec` for the read.
(The original `struct page` is not available, since it wasn't freed, as
page_ref_freeze() failed due to more page refs. And even if it were
available, its data cannot be trusted anymore.)
Solution:
========
One solution is to check for the expected page reference count in
try_to_unmap_one().
There should be one reference from the isolation (that is also checked in
shrink_page_list() with page_ref_freeze()) plus one or more references
from page mapping(s) (put in discard: label). Further references mean
that rmap/PTE cannot be unmapped/nuked.
(Note: there might be more than one reference from mapping due to
fork()/clone() without CLONE_VM, which use the same `struct page` for
references, until the copy-on-write page gets copied.)
So, additional page references (e.g., from direct IO read) now prevent the
rmap/PTE from being unmapped/dropped, similarly to how the page is not freed
per shrink_page_list()/page_ref_freeze().
- Races and Barriers:
==================
The new check in try_to_unmap_one() should be safe in races with
bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's
done under the PTE lock.
The fast path doesn't take the lock, but it checks if the PTE has changed
and if so, it drops the reference and leaves the page for the slow path
(which does take that lock).
The fast path requires synchronization w/ full memory barrier: it writes
the page reference count first then it reads the PTE later, while
try_to_unmap() writes PTE first then it reads page refcount.
And a second barrier is needed, as the page dirty flag should not be read
before the page reference count (as in __remove_mapping()). (This can be
a load memory barrier only; no writes are involved.)
Call stack/comments:
- try_to_unmap_one()
- page_vma_mapped_walk()
- map_pte() # see pte_offset_map_lock():
pte_offset_map()
spin_lock()
- ptep_get_and_clear() # write PTE
- smp_mb() # (new barrier) GUP fast path
- page_ref_count() # (new check) read refcount
- page_vma_mapped_walk_done() # see pte_unmap_unlock():
pte_unmap()
spin_unlock()
- bio_iov_iter_get_pages()
- __bio_iov_iter_get_pages()
- iov_iter_get_pages()
- get_user_pages_fast()
- internal_get_user_pages_fast()
# fast path
- lockless_pages_from_mm()
- gup_{pgd,p4d,pud,pmd,pte}_range()
ptep = pte_offset_map() # not _lock()
pte = ptep_get_lockless(ptep)
page = pte_page(pte)
try_grab_compound_head(page) # inc refcount
# (RMW/barrier
# on success)
if (pte_val(pte) != pte_val(*ptep)) # read PTE
put_compound_head(page) # dec refcount
# go slow path
# slow path
- __gup_longterm_unlocked()
- get_user_pages_unlocked()
- __get_user_pages_locked()
- __get_user_pages()
- follow_{page,p4d,pud,pmd}_mask()
- follow_page_pte()
ptep = pte_offset_map_lock()
pte = *ptep
page = vm_normal_page(pte)
try_grab_page(page) # inc refcount
pte_unmap_unlock()
- Huge Pages:
==========
Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE
(aka lazyfree) pages are PageAnon() && !PageSwapBacked()
(madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn())
thus should reach shrink_page_list() -> split_huge_page_to_list() before
try_to_unmap[_one](), so it deals with normal pages only.
(And in case unlikely/TTU_SPLIT_HUGE_PMD/split_huge_pmd_address() happens,
which should not happen or should be rare, the page refcount should be
greater than the mapcount: the head page is referenced by tail pages. That
also prevents checking the head `page` and then incorrectly calling
page_remove_rmap(subpage) for a tail page that isn't even in the
shrink_page_list()'s page_list (an effect of split huge pmd/pvmw), as it
might happen today in this unlikely
scenario.)
MADV_FREE'd buffers:
===================
So, back to the "if MADV_FREE pages are used as buffers" note. The case
is arguable, and subject to multiple interpretations.
The madvise(2) manual page on the MADV_FREE advice value says:
1) 'After a successful MADV_FREE ... data will be lost when
the kernel frees the pages.'
2) 'the free operation will be canceled if the caller writes
into the page' / 'subsequent writes ... will succeed and
then [the] kernel cannot free those dirtied pages'
3) 'If there is no subsequent write, the kernel can free the
pages at any time.'
Thoughts, questions, considerations... respectively:
1) Since the kernel didn't actually free the page (page_ref_freeze()
failed), should the data not have been lost? (on userspace read.)
2) Should writes performed by the direct IO read be able to cancel
the free operation?
- Should the direct IO read be considered as 'the caller' too,
as it's been requested by 'the caller'?
- Should the bio technique to dirty pages on return to userspace
(bio_check_pages_dirty() is called/used by __blkdev_direct_IO())
be considered in another/special way here?
3) Should an upcoming write from a previously requested direct IO
read be considered as a subsequent write, so the kernel should
not free the pages? (as it's known at the time of page reclaim.)
And lastly:
Technically, the last point would seem a reasonable consideration and
balance, as the madvise(2) manual page apparently (and fairly) seems to
assume that 'writes' are memory accesses from the userspace process (not
explicitly considering writes from the kernel or its corner cases; again,
fairly). Plus, the kernel fix implementation for the corner case of the
largely 'non-atomic write' encompassed by a direct IO read operation is
relatively simple; and it helps.
Reproducer:
==========
@ test.c (simplified, but works)
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
int main() {
int fd, i;
char *buf;
fd = open(DEV, O_RDONLY | O_DIRECT);
buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
buf[i] = 1; // init to non-zero
madvise(buf, BUF_SIZE, MADV_FREE);
read(fd, buf, BUF_SIZE);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
printf("%p: 0x%x\n", &buf[i], buf[i]);
return 0;
}
@ block/fops.c (formerly fs/block_dev.c)
+#include <linux/swap.h>
...
... __blkdev_direct_IO[_simple](...)
{
...
+ if (!strcmp(current->comm, "good"))
+ shrink_all_memory(ULONG_MAX);
+
ret = bio_iov_iter_get_pages(...);
+
+ if (!strcmp(current->comm, "bad"))
+ shrink_all_memory(ULONG_MAX);
...
}
@ shell
# NUM_PAGES=4
# PAGE_SIZE=$(getconf PAGE_SIZE)
# yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES}
# DEV=$(losetup -f --show test.img)
# gcc -DDEV=\"$DEV\" \
-DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \
-DPAGE_SIZE=${PAGE_SIZE} \
test.c -o test
# od -tx1 $DEV
0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a
*
0040000
# mv test good
# ./good
0x7f7c10418000: 0x79
0x7f7c10419000: 0x79
0x7f7c1041a000: 0x79
0x7f7c1041b000: 0x79
# mv good bad
# ./bad
0x7fa1b8050000: 0x0
0x7fa1b8051000: 0x0
0x7fa1b8052000: 0x0
0x7fa1b8053000: 0x0
Note: the issue is consistent on v5.17-rc3, but it's intermittent with the
support of MADV_FREE on v4.5 (60%-70% error; needs swap). [wrap
do_direct_IO() in do_blockdev_direct_IO() @ fs/direct-io.c].
- v5.17-rc3:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x0
# free | grep Swap
Swap: 0 0 0
- v4.5:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
2702 0x0
1298 0x79
# swapoff -av
swapoff /swap
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
Ceph/TCMalloc:
=============
For documentation purposes, the use case driving the analysis/fix is Ceph
on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to
release unused memory to the system from the mmap'ed page heap (might be
committed back/used again; it's not munmap'ed.)
- PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise()
- PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing.
Note: TCMalloc switched back to MADV_DONTNEED a few commits after the
release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue
just 'disappeared' on Ceph on later Ubuntu releases but is still present
in the kernel, and can be hit by other use cases.
The observed issue seems to be the old Ceph bug #22464 [1], where checksum
mismatches are observed (and instrumentation with buffer dumps shows
zero-pages read from mmap'ed/MADV_FREE'd page ranges).
The issue in Ceph was reasonably deemed a kernel bug (comment #50) and
mostly worked around with a retry mechanism, but other parts of Ceph could
still hit that (rocksdb). Anyway, it's less likely to be hit again as
TCMalloc switched out of MADV_FREE by default.
(Some kernel versions/reports from the Ceph bug, and relation with
the MADV_FREE introduction/changes; TCMalloc versions not checked.)
- 4.4 good
- 4.5 (madv_free: introduction)
- 4.9 bad
- 4.10 good? maybe a swapless system
- 4.12 (madv_free: no longer free instantly on swapless systems)
- 4.13 bad
[1] https://tracker.ceph.com/issues/22464
Thanks:
======
Several people contributed to analysis/discussions/tests/reproducers in
the first stages when drilling down on ceph/tcmalloc/linux kernel:
- Dan Hill
- Dan Streetman
- Dongdong Tao
- Gavin Guo
- Gerald Yang
- Heitor Alves de Siqueira
- Ioanna Alifieraki
- Jay Vosburgh
- Matthew Ruffell
- Ponnuvel Palaniyappan
Reviews, suggestions, corrections, comments:
- Minchan Kim
- Yu Zhao
- Huang, Ying
- John Hubbard
- Christoph Hellwig
[mfo(a)canonical.com: v4]
Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.com
Link: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com
Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Signed-off-by: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Dan Hill <daniel.hill(a)canonical.com>
Cc: Dan Streetman <dan.streetman(a)canonical.com>
Cc: Dongdong Tao <dongdong.tao(a)canonical.com>
Cc: Gavin Guo <gavin.guo(a)canonical.com>
Cc: Gerald Yang <gerald.yang(a)canonical.com>
Cc: Heitor Alves de Siqueira <halves(a)canonical.com>
Cc: Ioanna Alifieraki <ioanna-maria.alifieraki(a)canonical.com>
Cc: Jay Vosburgh <jay.vosburgh(a)canonical.com>
Cc: Matthew Ruffell <matthew.ruffell(a)canonical.com>
Cc: Ponnuvel Palaniyappan <ponnuvel.palaniyappan(a)canonical.com>
Cc: <stable(a)vger.kernel.org>
Cc: Christoph Hellwig <hch(a)infradead.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/rmap.c b/mm/rmap.c
index bfcc8e3d412f..5cb970d51f0a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1588,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* MADV_FREE page check */
if (!folio_test_swapbacked(folio)) {
- if (!folio_test_dirty(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Date: Thu, 24 Mar 2022 18:14:09 -0700
Subject: [PATCH] mm: fix race between MADV_FREE reclaim and blkdev direct IO
read
Problem:
=======
Userspace might read the zero-page instead of actual data from a direct IO
read on a block device if madvise(MADV_FREE) has been called on the buffers
earlier (this is discussed below), due to a race between page reclaim on
MADV_FREE and blkdev direct IO read.
- Race condition:
==============
During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks
if the page is not dirty, then discards its rmap PTE(s) (vs. remap back
if the page is dirty).
However, after try_to_unmap_one() returns to shrink_page_list(), it might
keep the page _anyway_ if page_ref_freeze() fails (it expects exactly
_one_ page reference, from the isolation for page reclaim).
Well, blkdev_direct_IO() gets references for all pages, and on READ
operations it only sets them dirty _later_.
So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct
IO read from block devices, and page reclaim happens during
__blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages()
returns, but BEFORE the pages are set dirty, the situation happens.
The direct IO read eventually completes. Now, when userspace reads the
buffers, the PTE is no longer there and the page fault handler
do_anonymous_page() services that with the zero-page, NOT the data!
A synthetic reproducer is provided.
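A simplified timeline of the race window (illustrative only, condensed
from the description above):
- direct IO read:  bio_iov_iter_get_pages()         # takes page refs
- page reclaim:    try_to_unmap_one()               # clean -> PTE discarded
- page reclaim:    page_ref_freeze() fails          # extra refs -> page kept
- direct IO read:  completes, sets pages dirty      # too late
- userspace read:  page fault -> do_anonymous_page() -> zero-page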
- Page faults:
===========
If page reclaim happens BEFORE bio_iov_iter_get_pages(), the issue doesn't
happen, because that faults in all pages as writeable, so
do_anonymous_page() sets up a new page/rmap/PTE, and that is used by
direct IO. The userspace reads don't fault, as the PTE is there (thus the
zero-page is not used/set up).
But if page reclaim happens AFTER bio_iov_iter_get_pages() and BEFORE the
pages are set dirty, the PTE is no longer there; the subsequent page
faults can't help:
The data read from the block device probably won't generate faults due to
DMA (which bypasses the MMU), but even if DMA weren't used, the read
happens at different virtual addresses (not user-mapped addresses),
because `struct bio_vec` stores `struct page` pointers to figure the
addresses out for the read (and those differ from the user-mapped
addresses).
Thus userspace reads (at user-mapped addresses) still fault; then
do_anonymous_page() gets another `struct page`, which addresses/maps
other memory than the `struct page` used by `struct bio_vec` for the read.
(The original `struct page` is not available, since it wasn't freed, as
page_ref_freeze() failed due to more page refs. And even if it were
available, its data cannot be trusted anymore.)
Solution:
========
One solution is to check for the expected page reference count in
try_to_unmap_one().
There should be one reference from the isolation (that is also checked in
shrink_page_list() with page_ref_freeze()) plus one or more references
from page mapping(s) (put in discard: label). Further references mean
that rmap/PTE cannot be unmapped/nuked.
(Note: there might be more than one reference from mapping due to
fork()/clone() without CLONE_VM, which use the same `struct page` for
references, until the copy-on-write page gets copied.)
So, additional page references (e.g., from a direct IO read) now prevent
the rmap/PTE from being unmapped/dropped, similarly to how they prevent
the page from being freed by shrink_page_list()/page_ref_freeze().
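As a worked example with hypothetical counts: a lazyfree page mapped by
two processes (fork() without CLONE_VM), isolated for reclaim, with an
in-flight direct IO read holding one more reference, has:
    ref_count = 1 (isolation) + 2 (mappings) + 1 (direct IO) = 4
    1 + map_count = 1 + 2 = 3
so the counts don't match and the rmap/PTEs are kept; without the direct
IO reference, ref_count would be 3 == 1 + map_count, and the unmap could
proceed.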
- Races and Barriers:
==================
The new check in try_to_unmap_one() should be safe in races with
bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's
done under the PTE lock.
The fast path doesn't take the lock, but it checks if the PTE has changed
and if so, it drops the reference and leaves the page for the slow path
(which does take that lock).
The fast path requires synchronization with a full memory barrier: it
writes the page reference count first and reads the PTE later, while
try_to_unmap() writes the PTE first and reads the page refcount later.
And a second barrier is needed, as the page dirty flag should not be read
before the page reference count (as in __remove_mapping()). (This can be
a load memory barrier only; no writes are involved.)
Call stack/comments:
- try_to_unmap_one()
- page_vma_mapped_walk()
- map_pte() # see pte_offset_map_lock():
pte_offset_map()
spin_lock()
- ptep_get_and_clear() # write PTE
- smp_mb() # (new barrier) GUP fast path
- page_ref_count() # (new check) read refcount
- page_vma_mapped_walk_done() # see pte_unmap_unlock():
pte_unmap()
spin_unlock()
- bio_iov_iter_get_pages()
- __bio_iov_iter_get_pages()
- iov_iter_get_pages()
- get_user_pages_fast()
- internal_get_user_pages_fast()
# fast path
- lockless_pages_from_mm()
- gup_{pgd,p4d,pud,pmd,pte}_range()
ptep = pte_offset_map() # not _lock()
pte = ptep_get_lockless(ptep)
page = pte_page(pte)
try_grab_compound_head(page) # inc refcount
# (RMW/barrier
# on success)
if (pte_val(pte) != pte_val(*ptep)) # read PTE
put_compound_head(page) # dec refcount
# go slow path
# slow path
- __gup_longterm_unlocked()
- get_user_pages_unlocked()
- __get_user_pages_locked()
- __get_user_pages()
- follow_{page,p4d,pud,pmd}_mask()
- follow_page_pte()
ptep = pte_offset_map_lock()
pte = *ptep
page = vm_normal_page(pte)
try_grab_page(page) # inc refcount
pte_unmap_unlock()
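Schematically, the pairing looks like this (a condensed sketch of the
call stacks above, not verbatim kernel code):
    reclaim (try_to_unmap_one)        GUP fast (gup_pte_range)
    --------------------------        ------------------------
    clear PTE                         inc page refcount
    smp_mb()                          # full barrier implied by the
    read page refcount                #  atomic RMW on success
                                      re-read PTE; if it changed:
                                        dec refcount, go to slow path
At least one side must observe the other's write: either reclaim sees the
elevated refcount and keeps the rmap/PTE, or the GUP fast path sees the
cleared PTE and falls back to the slow path, which serializes with
try_to_unmap_one() on the PTE lock.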
- Huge Pages:
==========
Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE
(aka lazyfree) pages are PageAnon() && !PageSwapBacked()
(madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn())
thus should reach shrink_page_list() -> split_huge_page_to_list() before
try_to_unmap[_one](), so it deals with normal pages only.
(And in the unlikely case that TTU_SPLIT_HUGE_PMD/split_huge_pmd_address()
happens, which should not happen or should be rare, the page refcount
should be greater than the mapcount: the head page is referenced by the
tail pages. That also prevents checking the head `page` and then
incorrectly calling page_remove_rmap(subpage) for a tail page that isn't
even in shrink_page_list()'s page_list (an effect of split huge pmd/pvmw),
as might happen today in this unlikely scenario.)
MADV_FREE'd buffers:
===================
So, back to the "if MADV_FREE pages are used as buffers" note. The case
is arguable, and subject to multiple interpretations.
The madvise(2) manual page on the MADV_FREE advice value says:
1) 'After a successful MADV_FREE ... data will be lost when
the kernel frees the pages.'
2) 'the free operation will be canceled if the caller writes
into the page' / 'subsequent writes ... will succeed and
then [the] kernel cannot free those dirtied pages'
3) 'If there is no subsequent write, the kernel can free the
pages at any time.'
Thoughts, questions, considerations... respectively:
1) Since the kernel didn't actually free the page (page_ref_freeze()
failed), should the data not have been lost? (on userspace read.)
2) Should writes performed by the direct IO read be able to cancel
the free operation?
- Should the direct IO read be considered as 'the caller' too,
as it's been requested by 'the caller'?
- Should the bio technique to dirty pages on return to userspace
(bio_check_pages_dirty() is called/used by __blkdev_direct_IO())
be considered in another/special way here?
3) Should an upcoming write from a previously requested direct IO
read be considered as a subsequent write, so the kernel should
not free the pages? (as it's known at the time of page reclaim.)
And lastly:
Technically, the last point would seem a reasonable consideration and
balance, as the madvise(2) manual page apparently (and fairly) seems to
assume that 'writes' are memory accesses from the userspace process (not
explicitly considering writes from the kernel or its corner cases; again,
fairly). Plus, the kernel fix for this corner case of the largely
'non-atomic write' encompassed by a direct IO read operation is relatively
simple; and it helps.
Reproducer:
==========
@ test.c (simplified, but works)
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
int main() {
int fd, i;
char *buf;
fd = open(DEV, O_RDONLY | O_DIRECT);
buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
buf[i] = 1; // init to non-zero
madvise(buf, BUF_SIZE, MADV_FREE);
read(fd, buf, BUF_SIZE);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
printf("%p: 0x%x\n", &buf[i], buf[i]);
return 0;
}
@ block/fops.c (formerly fs/block_dev.c)
+#include <linux/swap.h>
...
... __blkdev_direct_IO[_simple](...)
{
...
+ if (!strcmp(current->comm, "good"))
+ shrink_all_memory(ULONG_MAX);
+
ret = bio_iov_iter_get_pages(...);
+
+ if (!strcmp(current->comm, "bad"))
+ shrink_all_memory(ULONG_MAX);
...
}
@ shell
# NUM_PAGES=4
# PAGE_SIZE=$(getconf PAGE_SIZE)
# yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES}
# DEV=$(losetup -f --show test.img)
# gcc -DDEV=\"$DEV\" \
-DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \
-DPAGE_SIZE=${PAGE_SIZE} \
test.c -o test
# od -tx1 $DEV
0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a
*
0040000
# mv test good
# ./good
0x7f7c10418000: 0x79
0x7f7c10419000: 0x79
0x7f7c1041a000: 0x79
0x7f7c1041b000: 0x79
# mv good bad
# ./bad
0x7fa1b8050000: 0x0
0x7fa1b8051000: 0x0
0x7fa1b8052000: 0x0
0x7fa1b8053000: 0x0
Note: the issue is consistently reproducible on v5.17-rc3, but it's
intermittent with the original MADV_FREE support on v4.5 (60%-70% error
rate; needs swap). [There, wrap do_direct_IO() in do_blockdev_direct_IO()
@ fs/direct-io.c.]
- v5.17-rc3:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x0
# free | grep Swap
Swap: 0 0 0
- v4.5:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
2702 0x0
1298 0x79
# swapoff -av
swapoff /swap
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
Ceph/TCMalloc:
=============
For documentation purposes, the use case driving the analysis/fix is Ceph
on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to
release unused memory to the system from the mmap'ed page heap (it might
be committed back/used again; it's not munmap'ed):
- PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise()
- PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing
Note: TCMalloc switched back to MADV_DONTNEED a few commits after the
release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue
just 'disappeared' on Ceph on later Ubuntu releases but is still present
in the kernel, and can be hit by other use cases.
The observed issue seems to be the old Ceph bug #22464 [1], where checksum
mismatches are observed (and instrumentation with buffer dumps shows
zero-pages read from mmap'ed/MADV_FREE'd page ranges).
The issue in Ceph was reasonably deemed a kernel bug (comment #50) and
mostly worked around with a retry mechanism, but other parts of Ceph
(e.g., rocksdb) could still hit it. Anyway, it's less likely to be hit
again as TCMalloc switched away from MADV_FREE by default.
(Some kernel versions/reports from the Ceph bug, and relation with
the MADV_FREE introduction/changes; TCMalloc versions not checked.)
- 4.4 good
- 4.5 (madv_free: introduction)
- 4.9 bad
- 4.10 good? maybe a swapless system
- 4.12 (madv_free: no longer free instantly on swapless systems)
- 4.13 bad
[1] https://tracker.ceph.com/issues/22464
Thanks:
======
Several people contributed to analysis/discussions/tests/reproducers in
the first stages when drilling down on ceph/tcmalloc/linux kernel:
- Dan Hill
- Dan Streetman
- Dongdong Tao
- Gavin Guo
- Gerald Yang
- Heitor Alves de Siqueira
- Ioanna Alifieraki
- Jay Vosburgh
- Matthew Ruffell
- Ponnuvel Palaniyappan
Reviews, suggestions, corrections, comments:
- Minchan Kim
- Yu Zhao
- Huang, Ying
- John Hubbard
- Christoph Hellwig
[mfo(a)canonical.com: v4]
Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.com
Link: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com
Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Signed-off-by: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Dan Hill <daniel.hill(a)canonical.com>
Cc: Dan Streetman <dan.streetman(a)canonical.com>
Cc: Dongdong Tao <dongdong.tao(a)canonical.com>
Cc: Gavin Guo <gavin.guo(a)canonical.com>
Cc: Gerald Yang <gerald.yang(a)canonical.com>
Cc: Heitor Alves de Siqueira <halves(a)canonical.com>
Cc: Ioanna Alifieraki <ioanna-maria.alifieraki(a)canonical.com>
Cc: Jay Vosburgh <jay.vosburgh(a)canonical.com>
Cc: Matthew Ruffell <matthew.ruffell(a)canonical.com>
Cc: Ponnuvel Palaniyappan <ponnuvel.palaniyappan(a)canonical.com>
Cc: <stable(a)vger.kernel.org>
Cc: Christoph Hellwig <hch(a)infradead.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/rmap.c b/mm/rmap.c
index bfcc8e3d412f..5cb970d51f0a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1588,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* MADV_FREE page check */
if (!folio_test_swapbacked(folio)) {
- if (!folio_test_dirty(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Date: Thu, 24 Mar 2022 18:14:09 -0700
Subject: [PATCH] mm: fix race between MADV_FREE reclaim and blkdev direct IO
read
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Date: Thu, 24 Mar 2022 18:14:09 -0700
Subject: [PATCH] mm: fix race between MADV_FREE reclaim and blkdev direct IO
read
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Date: Thu, 24 Mar 2022 18:14:09 -0700
Subject: [PATCH] mm: fix race between MADV_FREE reclaim and blkdev direct IO
read
The patch below does not apply to the 5.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c8e2a256915a223f6289f651d6b926cd7135c9e Mon Sep 17 00:00:00 2001
From: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Date: Thu, 24 Mar 2022 18:14:09 -0700
Subject: [PATCH] mm: fix race between MADV_FREE reclaim and blkdev direct IO
read
Problem:
=======
Userspace might read the zero-page instead of actual data from a direct IO
read on a block device if madvise(MADV_FREE) was called on the buffers
earlier (this is discussed below), due to a race between page reclaim on
MADV_FREE and blkdev direct IO read.
- Race condition:
==============
During page reclaim, the MADV_FREE page check in try_to_unmap_one() checks
whether the page is not dirty, then discards its rmap PTE(s) (vs. remapping
them back if the page is dirty).
However, after try_to_unmap_one() returns to shrink_page_list(), it might
keep the page _anyway_ if page_ref_freeze() fails (it expects exactly
_one_ page reference, from the isolation for page reclaim).
Well, blkdev_direct_IO() gets references for all pages, and on READ
operations it only sets them dirty _later_.
So, if MADV_FREE'd pages (i.e., not dirty) are used as buffers for direct
IO read from block devices, and page reclaim happens during
__blkdev_direct_IO[_simple]() exactly AFTER bio_iov_iter_get_pages()
returns, but BEFORE the pages are set dirty, the situation happens.
The direct IO read eventually completes. Now, when userspace reads the
buffers, the PTE is no longer there and the page fault handler
do_anonymous_page() services that with the zero-page, NOT the data!
A synthetic reproducer is provided.
- Page faults:
===========
If page reclaim happens BEFORE bio_iov_iter_get_pages(), the issue doesn't
happen, because that faults in all pages as writeable, so
do_anonymous_page() sets up a new page/rmap/PTE, and that is used by
direct IO. The userspace reads don't fault as the PTE is there (thus the
zero-page is not used/set up).
But if page reclaim happens AFTER it / BEFORE setting pages dirty, the PTE
is no longer there; the subsequent page faults can't help:
The data read from the block device probably won't generate faults, due to
DMA (no MMU involvement); but even in the case it wouldn't use DMA, the
read happens on different virtual addresses (not user-mapped addresses),
because `struct bio_vec` stores `struct page` pointers to figure the
addresses out (which are different from the user-mapped addresses) for the
read.
Thus userspace reads (to user-mapped addresses) still fault; then
do_anonymous_page() gets another `struct page` that would address/map to
other memory than the `struct page` used by `struct bio_vec` for the read.
(The original `struct page` is not available, since it wasn't freed, as
page_ref_freeze() failed due to more page refs. And even if it were
available, its data cannot be trusted anymore.)
Solution:
========
One solution is to check for the expected page reference count in
try_to_unmap_one().
There should be one reference from the isolation (that is also checked in
shrink_page_list() with page_ref_freeze()) plus one or more references
from page mapping(s) (dropped at the discard: label). Further references
mean that the rmap/PTE cannot be unmapped/nuked.
(Note: there might be more than one reference from mapping due to
fork()/clone() without CLONE_VM, which use the same `struct page` for
references, until the copy-on-write page gets copied.)
So, additional page references (e.g., from direct IO read) now prevent the
rmap/PTE from being unmapped/dropped, similarly to how the page is not
freed by shrink_page_list()/page_ref_freeze().
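To make the expected-count arithmetic concrete, a small illustration
(the counts are assumed for a simple anonymous page, not taken verbatim
from the patch):
    /*
     * Illustration only; counts assumed for a simple anon page.
     *
     * Isolated lazyfree page, single mapping, no other users:
     *   ref_count == 2 (isolation + rmap), map_count == 1
     *   ref_count == 1 + map_count  ->  safe to drop the rmap/PTE
     *
     * Same page with a direct IO read in flight (extra GUP ref):
     *   ref_count == 3 (isolation + rmap + GUP), map_count == 1
     *   ref_count != 1 + map_count  ->  keep the rmap/PTE
     */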
- Races and Barriers:
==================
The new check in try_to_unmap_one() should be safe in races with
bio_iov_iter_get_pages() in get_user_pages() fast and slow paths, as it's
done under the PTE lock.
The fast path doesn't take the lock, but it checks if the PTE has changed
and if so, it drops the reference and leaves the page for the slow path
(which does take that lock).
The fast path requires synchronization with a full memory barrier: it
writes the page reference count first and reads the PTE later, while
try_to_unmap() writes the PTE first and reads the page refcount later.
And a second barrier is needed, as the page dirty flag should not be read
before the page reference count (as in __remove_mapping()). (This can be
a load memory barrier only; no writes are involved.)
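Schematically, the pairing described above (a sketch, not code from the
patch):
    reclaim (try_to_unmap_one)      GUP fast path (gup_pte_range)
    --------------------------      -----------------------------
    clear PTE                       inc page refcount
    smp_mb()                        full barrier (atomic RMW)
    read refcount, mapcount         re-read PTE
    smp_rmb()                       changed? -> put ref, go slow path
    read dirty flag
Either reclaim observes the raised refcount, or the GUP fast path observes
the cleared PTE and falls back to the slow path (which takes the PTE lock);
the two sides cannot both miss each other.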
Call stack/comments:
- try_to_unmap_one()
- page_vma_mapped_walk()
- map_pte() # see pte_offset_map_lock():
pte_offset_map()
spin_lock()
- ptep_get_and_clear() # write PTE
- smp_mb() # (new barrier) GUP fast path
- page_ref_count() # (new check) read refcount
- page_vma_mapped_walk_done() # see pte_unmap_unlock():
pte_unmap()
spin_unlock()
- bio_iov_iter_get_pages()
- __bio_iov_iter_get_pages()
- iov_iter_get_pages()
- get_user_pages_fast()
- internal_get_user_pages_fast()
# fast path
- lockless_pages_from_mm()
- gup_{pgd,p4d,pud,pmd,pte}_range()
ptep = pte_offset_map() # not _lock()
pte = ptep_get_lockless(ptep)
page = pte_page(pte)
try_grab_compound_head(page) # inc refcount
# (RMW/barrier
# on success)
if (pte_val(pte) != pte_val(*ptep)) # read PTE
put_compound_head(page) # dec refcount
# go slow path
# slow path
- __gup_longterm_unlocked()
- get_user_pages_unlocked()
- __get_user_pages_locked()
- __get_user_pages()
- follow_{page,p4d,pud,pmd}_mask()
- follow_page_pte()
ptep = pte_offset_map_lock()
pte = *ptep
page = vm_normal_page(pte)
try_grab_page(page) # inc refcount
pte_unmap_unlock()
- Huge Pages:
==========
Regarding transparent hugepages, that logic shouldn't change, as MADV_FREE
(aka lazyfree) pages are PageAnon() && !PageSwapBacked()
(madvise_free_pte_range() -> mark_page_lazyfree() -> lru_lazyfree_fn())
thus should reach shrink_page_list() -> split_huge_page_to_list() before
try_to_unmap[_one](), so it deals with normal pages only.
(And in the unlikely case that TTU_SPLIT_HUGE_PMD/split_huge_pmd_address()
happens, which should not happen or should be rare, the page refcount would
be greater than the mapcount: the head page is referenced by the tail
pages. That also prevents checking the head `page` and then incorrectly
calling page_remove_rmap(subpage) for a tail page that isn't even in
shrink_page_list()'s page_list (an effect of split huge pmd/pvmw), as might
happen today in this unlikely scenario.)
MADV_FREE'd buffers:
===================
So, back to the "if MADV_FREE pages are used as buffers" note. The case
is arguable, and subject to multiple interpretations.
The madvise(2) manual page on the MADV_FREE advice value says:
1) 'After a successful MADV_FREE ... data will be lost when
the kernel frees the pages.'
2) 'the free operation will be canceled if the caller writes
into the page' / 'subsequent writes ... will succeed and
then [the] kernel cannot free those dirtied pages'
3) 'If there is no subsequent write, the kernel can free the
pages at any time.'
Thoughts, questions, considerations... respectively:
1) Since the kernel didn't actually free the page (page_ref_freeze()
failed), should the data not have been lost? (on userspace read.)
2) Should writes performed by the direct IO read be able to cancel
the free operation?
- Should the direct IO read be considered as 'the caller' too,
as it's been requested by 'the caller'?
- Should the bio technique to dirty pages on return to userspace
(bio_check_pages_dirty() is called/used by __blkdev_direct_IO())
be considered in another/special way here?
3) Should an upcoming write from a previously requested direct IO
read be considered as a subsequent write, so the kernel should
not free the pages? (as it's known at the time of page reclaim.)
And lastly:
Technically, the last point seems a reasonable consideration and balance,
as the madvise(2) manual page apparently (and fairly) seems to assume that
'writes' are memory accesses from the userspace process (not explicitly
considering writes from the kernel or its corner cases; again, fairly).
Plus, the kernel fix for the corner case of the largely 'non-atomic write'
encompassed by a direct IO read operation is relatively simple, and it
helps.
Reproducer:
==========
@ test.c (simplified, but works)
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
int main() {
int fd, i;
char *buf;
fd = open(DEV, O_RDONLY | O_DIRECT);
buf = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
buf[i] = 1; // init to non-zero
madvise(buf, BUF_SIZE, MADV_FREE);
read(fd, buf, BUF_SIZE);
for (i = 0; i < BUF_SIZE; i += PAGE_SIZE)
printf("%p: 0x%x\n", &buf[i], buf[i]);
return 0;
}
@ block/fops.c (formerly fs/block_dev.c)
+#include <linux/swap.h>
...
... __blkdev_direct_IO[_simple](...)
{
...
+ if (!strcmp(current->comm, "good"))
+ shrink_all_memory(ULONG_MAX);
+
ret = bio_iov_iter_get_pages(...);
+
+ if (!strcmp(current->comm, "bad"))
+ shrink_all_memory(ULONG_MAX);
...
}
@ shell
# NUM_PAGES=4
# PAGE_SIZE=$(getconf PAGE_SIZE)
# yes | dd of=test.img bs=${PAGE_SIZE} count=${NUM_PAGES}
# DEV=$(losetup -f --show test.img)
# gcc -DDEV=\"$DEV\" \
-DBUF_SIZE=$((PAGE_SIZE * NUM_PAGES)) \
-DPAGE_SIZE=${PAGE_SIZE} \
test.c -o test
# od -tx1 $DEV
0000000 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a 79 0a
*
0040000
# mv test good
# ./good
0x7f7c10418000: 0x79
0x7f7c10419000: 0x79
0x7f7c1041a000: 0x79
0x7f7c1041b000: 0x79
# mv good bad
# ./bad
0x7fa1b8050000: 0x0
0x7fa1b8051000: 0x0
0x7fa1b8052000: 0x0
0x7fa1b8053000: 0x0
Note: the issue is consistent on v5.17-rc3, but it's intermittent with the
support of MADV_FREE on v4.5 (60%-70% error; needs swap). [wrap
do_direct_IO() in do_blockdev_direct_IO() @ fs/direct-io.c].
- v5.17-rc3:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x0
# free | grep Swap
Swap: 0 0 0
- v4.5:
# for i in {1..1000}; do ./good; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
# mv good bad
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
2702 0x0
1298 0x79
# swapoff -av
swapoff /swap
# for i in {1..1000}; do ./bad; done \
| cut -d: -f2 | sort | uniq -c
4000 0x79
Ceph/TCMalloc:
=============
For documentation purposes, the use case driving the analysis/fix is Ceph
on Ubuntu 18.04, as the TCMalloc library there still uses MADV_FREE to
release unused memory to the system from the mmap'ed page heap (might be
committed back/used again; it's not munmap'ed):
- PageHeap::DecommitSpan() -> TCMalloc_SystemRelease() -> madvise()
- PageHeap::CommitSpan() -> TCMalloc_SystemCommit() -> do nothing
Note: TCMalloc switched back to MADV_DONTNEED a few commits after the
release in Ubuntu 18.04 (google-perftools/gperftools 2.5), so the issue
just 'disappeared' on Ceph on later Ubuntu releases but is still present
in the kernel, and can be hit by other use cases.
The observed issue seems to be the old Ceph bug #22464 [1], where checksum
mismatches are observed (and instrumentation with buffer dumps shows
zero-pages read from mmap'ed/MADV_FREE'd page ranges).
The issue in Ceph was reasonably deemed a kernel bug (comment #50) and
mostly worked around with a retry mechanism, but other parts of Ceph could
still hit that (rocksdb). Anyway, it's less likely to be hit again as
TCMalloc switched out of MADV_FREE by default.
(Some kernel versions/reports from the Ceph bug, and relation with
the MADV_FREE introduction/changes; TCMalloc versions not checked.)
- 4.4 good
- 4.5 (madv_free: introduction)
- 4.9 bad
- 4.10 good? maybe a swapless system
- 4.12 (madv_free: no longer free instantly on swapless systems)
- 4.13 bad
[1] https://tracker.ceph.com/issues/22464
Thanks:
======
Several people contributed to analysis/discussions/tests/reproducers in
the first stages when drilling down on ceph/tcmalloc/linux kernel:
- Dan Hill
- Dan Streetman
- Dongdong Tao
- Gavin Guo
- Gerald Yang
- Heitor Alves de Siqueira
- Ioanna Alifieraki
- Jay Vosburgh
- Matthew Ruffell
- Ponnuvel Palaniyappan
Reviews, suggestions, corrections, comments:
- Minchan Kim
- Yu Zhao
- Huang, Ying
- John Hubbard
- Christoph Hellwig
[mfo(a)canonical.com: v4]
Link: https://lkml.kernel.org/r/20220209202659.183418-1-mfo@canonical.com
Link: https://lkml.kernel.org/r/20220131230255.789059-1-mfo@canonical.com
Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Signed-off-by: Mauricio Faria de Oliveira <mfo(a)canonical.com>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: Miaohe Lin <linmiaohe(a)huawei.com>
Cc: Dan Hill <daniel.hill(a)canonical.com>
Cc: Dan Streetman <dan.streetman(a)canonical.com>
Cc: Dongdong Tao <dongdong.tao(a)canonical.com>
Cc: Gavin Guo <gavin.guo(a)canonical.com>
Cc: Gerald Yang <gerald.yang(a)canonical.com>
Cc: Heitor Alves de Siqueira <halves(a)canonical.com>
Cc: Ioanna Alifieraki <ioanna-maria.alifieraki(a)canonical.com>
Cc: Jay Vosburgh <jay.vosburgh(a)canonical.com>
Cc: Matthew Ruffell <matthew.ruffell(a)canonical.com>
Cc: Ponnuvel Palaniyappan <ponnuvel.palaniyappan(a)canonical.com>
Cc: <stable(a)vger.kernel.org>
Cc: Christoph Hellwig <hch(a)infradead.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/rmap.c b/mm/rmap.c
index bfcc8e3d412f..5cb970d51f0a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1588,7 +1588,30 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
/* MADV_FREE page check */
if (!folio_test_swapbacked(folio)) {
- if (!folio_test_dirty(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 734c15700cdf9062ae98d8b131c6fe873dfad26d Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador(a)suse.de>
Date: Tue, 22 Mar 2022 14:47:37 -0700
Subject: [PATCH] mm: only re-generate demotion targets when a numa node
changes its N_CPU state
Abhishek reported that after patch [1], hotplug operations are taking
roughly double the expected time. [2]
The reason behind this is that the CPU callbacks that
migrate_on_reclaim_init() sets always call set_migration_target_nodes()
whenever a CPU is brought up/down.
But we only care about numa nodes going from having CPUs to becoming
cpuless, and vice versa, as that influences the demotion_target order.
We do already have two CPU callbacks (vmstat_cpu_online() and
vmstat_cpu_dead()) that check exactly that, so get rid of the CPU
callbacks in migrate_on_reclaim_init() and only call
set_migration_target_nodes() from vmstat_cpu_{dead,online}() whenever a
numa node changes its N_CPU state.
[1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.…
[2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux…
[osalvador(a)suse.de: add feedback from Huang Ying]
Link: https://lkml.kernel.org/r/20220314150945.12694-1-osalvador@suse.de
Link: https://lkml.kernel.org/r/20220310120749.23077-1-osalvador@suse.de
Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
Signed-off-by: Oscar Salvador <osalvador(a)suse.de>
Reviewed-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Reported-by: Abhishek Goel <huntbag(a)linux.vnet.ibm.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Abhishek Goel <huntbag(a)linux.vnet.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index db96e10eb8da..90e75d5a54d6 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count);
extern bool numa_demotion_enabled;
+extern void migrate_on_reclaim_init(void);
+#ifdef CONFIG_HOTPLUG_CPU
+extern void set_migration_target_nodes(void);
#else
+static inline void set_migration_target_nodes(void) {}
+#endif
+#else
+
+static inline void set_migration_target_nodes(void) {}
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t new,
diff --git a/mm/migrate.c b/mm/migrate.c
index 78b2cf87946d..bc9da3fd01aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3209,7 +3209,7 @@ static void __set_migration_target_nodes(void)
/*
* For callers that do not hold get_online_mems() already.
*/
-static void set_migration_target_nodes(void)
+void set_migration_target_nodes(void)
{
get_online_mems();
__set_migration_target_nodes();
@@ -3273,51 +3273,24 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
-/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs. That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
+void __init migrate_on_reclaim_init(void)
{
- set_migration_target_nodes();
- return 0;
-}
-
-static int __init migrate_on_reclaim_init(void)
-{
- int ret;
-
node_demotion = kmalloc_array(nr_node_ids,
sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
- ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
- NULL, migration_offline_cpu);
+ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
/*
- * In the unlikely case that this fails, the automatic
- * migration targets may become suboptimal for nodes
- * where N_CPU changes. With such a small impact in a
- * rare case, do not bother trying to do anything special.
+ * At this point, all numa nodes with memory/CPUs have their state
+ * properly set, so we can build the demotion order now.
+ * Let us hold the cpu_hotplug lock, as we could possibly have
+ * CPU hotplug events during boot.
*/
- WARN_ON(ret < 0);
- ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
- migration_online_cpu, NULL);
- WARN_ON(ret < 0);
-
- hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
- return 0;
+ cpus_read_lock();
+ set_migration_target_nodes();
+ cpus_read_unlock();
}
-late_initcall(migrate_on_reclaim_init);
#endif /* CONFIG_HOTPLUG_CPU */
bool numa_demotion_enabled = false;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d5cc8d739fac..b75b1a64b54c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/migrate.h>
#include "internal.h"
@@ -2049,7 +2050,12 @@ static void __init init_cpu_node_state(void)
static int vmstat_cpu_online(unsigned int cpu)
{
refresh_zone_stat_thresholds();
- node_set_state(cpu_to_node(cpu), N_CPU);
+
+ if (!node_state(cpu_to_node(cpu), N_CPU)) {
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ set_migration_target_nodes();
+ }
+
return 0;
}
@@ -2072,6 +2078,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
+ set_migration_target_nodes();
+
return 0;
}
@@ -2103,6 +2111,9 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
+#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
+ migrate_on_reclaim_init();
+#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 734c15700cdf9062ae98d8b131c6fe873dfad26d Mon Sep 17 00:00:00 2001
From: Oscar Salvador <osalvador(a)suse.de>
Date: Tue, 22 Mar 2022 14:47:37 -0700
Subject: [PATCH] mm: only re-generate demotion targets when a numa node
changes its N_CPU state
Abhishek reported that after patch [1], hotplug operations are taking
roughly double the expected time. [2]
The reason behind this is that the CPU callbacks that
migrate_on_reclaim_init() sets always call set_migration_target_nodes()
whenever a CPU is brought up/down.
But we only care about numa nodes going from having CPUs to becoming
cpuless, and vice versa, as that influences the demotion_target order.
We do already have two CPU callbacks (vmstat_cpu_online() and
vmstat_cpu_dead()) that check exactly that, so get rid of the CPU
callbacks in migrate_on_reclaim_init() and only call
set_migration_target_nodes() from vmstat_cpu_{dead,online}() whenever a
numa node changes its N_CPU state.
[1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.…
[2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux…
[osalvador(a)suse.de: add feedback from Huang Ying]
Link: https://lkml.kernel.org/r/20220314150945.12694-1-osalvador@suse.de
Link: https://lkml.kernel.org/r/20220310120749.23077-1-osalvador@suse.de
Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
Signed-off-by: Oscar Salvador <osalvador(a)suse.de>
Reviewed-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Reported-by: Abhishek Goel <huntbag(a)linux.vnet.ibm.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Abhishek Goel <huntbag(a)linux.vnet.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index db96e10eb8da..90e75d5a54d6 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count);
extern bool numa_demotion_enabled;
+extern void migrate_on_reclaim_init(void);
+#ifdef CONFIG_HOTPLUG_CPU
+extern void set_migration_target_nodes(void);
#else
+static inline void set_migration_target_nodes(void) {}
+#endif
+#else
+
+static inline void set_migration_target_nodes(void) {}
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t new,
diff --git a/mm/migrate.c b/mm/migrate.c
index 78b2cf87946d..bc9da3fd01aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3209,7 +3209,7 @@ static void __set_migration_target_nodes(void)
/*
* For callers that do not hold get_online_mems() already.
*/
-static void set_migration_target_nodes(void)
+void set_migration_target_nodes(void)
{
get_online_mems();
__set_migration_target_nodes();
@@ -3273,51 +3273,24 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
-/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs. That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
+void __init migrate_on_reclaim_init(void)
{
- set_migration_target_nodes();
- return 0;
-}
-
-static int __init migrate_on_reclaim_init(void)
-{
- int ret;
-
node_demotion = kmalloc_array(nr_node_ids,
sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
- ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
- NULL, migration_offline_cpu);
+ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
/*
- * In the unlikely case that this fails, the automatic
- * migration targets may become suboptimal for nodes
- * where N_CPU changes. With such a small impact in a
- * rare case, do not bother trying to do anything special.
+ * At this point, all numa nodes with memory/CPUs have their state
+ * properly set, so we can build the demotion order now.
+ * Let us hold the cpu_hotplug lock, as we could possibly have
+ * CPU hotplug events during boot.
*/
- WARN_ON(ret < 0);
- ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
- migration_online_cpu, NULL);
- WARN_ON(ret < 0);
-
- hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
- return 0;
+ cpus_read_lock();
+ set_migration_target_nodes();
+ cpus_read_unlock();
}
-late_initcall(migrate_on_reclaim_init);
#endif /* CONFIG_HOTPLUG_CPU */
bool numa_demotion_enabled = false;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d5cc8d739fac..b75b1a64b54c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/migrate.h>
#include "internal.h"
@@ -2049,7 +2050,12 @@ static void __init init_cpu_node_state(void)
static int vmstat_cpu_online(unsigned int cpu)
{
refresh_zone_stat_thresholds();
- node_set_state(cpu_to_node(cpu), N_CPU);
+
+ if (!node_state(cpu_to_node(cpu), N_CPU)) {
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ set_migration_target_nodes();
+ }
+
return 0;
}
@@ -2072,6 +2078,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
+ set_migration_target_nodes();
+
return 0;
}
@@ -2103,6 +2111,9 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
+#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
+ migrate_on_reclaim_init();
+#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
Hi Greg,
Fedora 36 has switched from efifb to simpledrm as the pre-native-GPU-driver
fb provider, and the lack of this patch is causing issues for devices with
non-upright mounted LCD panels (1). Can you please add this to the 5.17
stable series?
Regards,
Hans
1) https://bugzilla.redhat.com/show_bug.cgi?id=2071134
Hans de Goede (1):
drm/simpledrm: Add "panel orientation" property on non-upright mounted
LCD panels
drivers/gpu/drm/tiny/simpledrm.c | 3 +++
1 file changed, 3 insertions(+)
--
2.35.1
I'm announcing the release of the 4.14.275 kernel.
All users of the 4.14 kernel series must upgrade.
The updated 4.14.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git linux-4.14.y
and can be browsed at the normal kernel.org git web browser:
https://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git;a=summary
thanks,
greg k-h
------------
Documentation/arm64/silicon-errata.txt | 1
Makefile | 2
arch/arm/include/asm/kvm_host.h | 6
arch/arm64/Kconfig | 24 ++
arch/arm64/include/asm/assembler.h | 34 ++
arch/arm64/include/asm/cpu.h | 1
arch/arm64/include/asm/cpucaps.h | 4
arch/arm64/include/asm/cpufeature.h | 39 +++
arch/arm64/include/asm/cputype.h | 20 +
arch/arm64/include/asm/fixmap.h | 6
arch/arm64/include/asm/kvm_host.h | 5
arch/arm64/include/asm/kvm_mmu.h | 2
arch/arm64/include/asm/mmu.h | 8
arch/arm64/include/asm/sections.h | 6
arch/arm64/include/asm/sysreg.h | 5
arch/arm64/include/asm/vectors.h | 74 ++++++
arch/arm64/kernel/bpi.S | 55 ++++
arch/arm64/kernel/cpu_errata.c | 395 ++++++++++++++++++++++++++++++++-
arch/arm64/kernel/cpufeature.c | 21 +
arch/arm64/kernel/cpuinfo.c | 1
arch/arm64/kernel/entry.S | 196 +++++++++++++---
arch/arm64/kernel/vmlinux.lds.S | 2
arch/arm64/kvm/hyp/hyp-entry.S | 4
arch/arm64/kvm/hyp/switch.c | 9
arch/arm64/mm/mmu.c | 11
drivers/clocksource/arm_arch_timer.c | 15 +
include/linux/arm-smccc.h | 7
virt/kvm/arm/psci.c | 12 +
28 files changed, 908 insertions(+), 57 deletions(-)
Anshuman Khandual (1):
arm64: Add Cortex-X2 CPU part definition
Arnd Bergmann (1):
arm64: arch_timer: avoid unused function warning
Greg Kroah-Hartman (1):
Linux 4.14.275
James Morse (19):
arm64: entry.S: Add ventry overflow sanity checks
arm64: entry: Make the trampoline cleanup optional
arm64: entry: Free up another register on kpti's tramp_exit path
arm64: entry: Move the trampoline data page before the text page
arm64: entry: Allow tramp_alias to access symbols after the 4K boundary
arm64: entry: Don't assume tramp_vectors is the start of the vectors
arm64: entry: Move trampoline macros out of ifdef'd section
arm64: entry: Make the kpti trampoline's kpti sequence optional
arm64: entry: Allow the trampoline text to occupy multiple pages
arm64: entry: Add non-kpti __bp_harden_el1_vectors for mitigations
arm64: entry: Add vectors that have the bhb mitigation sequences
arm64: entry: Add macro for reading symbol addresses from the trampoline
arm64: Add percpu vectors for EL1
arm64: proton-pack: Report Spectre-BHB vulnerabilities as part of Spectre-v2
KVM: arm64: Add templates for BHB mitigation sequences
arm64: Mitigate spectre style branch history side channels
KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated
arm64: add ID_AA64ISAR2_EL1 sys register
arm64: Use the clearbhb instruction in mitigations
Marc Zyngier (4):
arm64: arch_timer: Add workaround for ARM erratum 1188873
arm64: Add silicon-errata.txt entry for ARM erratum 1188873
arm64: Make ARM64_ERRATUM_1188873 depend on COMPAT
arm64: Add part number for Neoverse N1
Rob Herring (1):
arm64: Add part number for Arm Cortex-A77
Suzuki K Poulose (1):
arm64: Add Neoverse-N2, Cortex-A710 CPU part definition
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7ba89d2af17aa879dda30f5d5d3f152e587fc551 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Wed, 23 Mar 2022 09:32:35 -0600
Subject: [PATCH] io_uring: ensure recv and recvmsg handle MSG_WAITALL
correctly
We currently don't attempt to get the full asked-for length, even if
MSG_WAITALL is set, if we get a partial receive. If we do see a partial
receive, then just note how many bytes we did get and return -EAGAIN to
get it retried.
The iov is advanced appropriately for the vector-based case, and we
manually bump the buffer and remainder for the non-vector case.
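For context, a sketch of the equivalent bookkeeping in plain socket code
(an illustration of the retry logic only, not io_uring code; the fd is
assumed to be a connected stream socket):

#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

/* What MSG_WAITALL saves callers from doing by hand; the patch makes
 * io_uring track the same "bytes done so far" (sr->done_io) when a
 * recv completes short. */
static ssize_t recv_all(int fd, char *buf, size_t len)
{
	size_t done = 0;

	while (done < len) {
		ssize_t n = recv(fd, buf + done, len - done, 0);

		if (n == 0)			/* EOF: return what we got */
			break;
		if (n < 0) {
			if (errno == EINTR)	/* interrupted: retry */
				continue;
			return done ? (ssize_t)done : -1;
		}
		done += (size_t)n;		/* partial receive: keep going */
	}
	return (ssize_t)done;
}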
Cc: stable(a)vger.kernel.org
Reported-by: Constantine Gavrilov <constantine.gavrilov(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f41d91ce1fd0..a70de170aea1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -612,6 +612,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
+ size_t done_io;
};
struct io_open {
@@ -5417,12 +5418,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
@@ -5465,6 +5475,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ return io_setup_async_msg(req, kmsg);
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
req_set_fail(req);
@@ -5474,6 +5488,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5524,12 +5542,22 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ return -EAGAIN;
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7ba89d2af17aa879dda30f5d5d3f152e587fc551 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Wed, 23 Mar 2022 09:32:35 -0600
Subject: [PATCH] io_uring: ensure recv and recvmsg handle MSG_WAITALL
correctly
We currently don't attempt to get the full asked-for length, even if
MSG_WAITALL is set, if we get a partial receive. If we do see a partial
receive, then just note how many bytes we did get and return -EAGAIN to
get it retried.
The iov is advanced appropriately for the vector-based case, and we
manually bump the buffer and remainder for the non-vector case.
Cc: stable(a)vger.kernel.org
Reported-by: Constantine Gavrilov <constantine.gavrilov(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f41d91ce1fd0..a70de170aea1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -612,6 +612,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
+ size_t done_io;
};
struct io_open {
@@ -5417,12 +5418,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
@@ -5465,6 +5475,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ return io_setup_async_msg(req, kmsg);
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
req_set_fail(req);
@@ -5474,6 +5488,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5524,12 +5542,22 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ return -EAGAIN;
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7ba89d2af17aa879dda30f5d5d3f152e587fc551 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Wed, 23 Mar 2022 09:32:35 -0600
Subject: [PATCH] io_uring: ensure recv and recvmsg handle MSG_WAITALL
correctly
We currently don't attempt to get the full asked-for length, even if
MSG_WAITALL is set, if we get a partial receive. If we do see a partial
receive, then just note how many bytes we did get and return -EAGAIN to
get it retried.
The iov is advanced appropriately for the vector-based case, and we
manually bump the buffer and remainder for the non-vector case.
Cc: stable(a)vger.kernel.org
Reported-by: Constantine Gavrilov <constantine.gavrilov(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f41d91ce1fd0..a70de170aea1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -612,6 +612,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
+ size_t done_io;
};
struct io_open {
@@ -5417,12 +5418,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
@@ -5465,6 +5475,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ return io_setup_async_msg(req, kmsg);
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
req_set_fail(req);
@@ -5474,6 +5488,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5524,12 +5542,22 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ return -EAGAIN;
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
The patch below does not apply to the 5.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 7ba89d2af17aa879dda30f5d5d3f152e587fc551 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Wed, 23 Mar 2022 09:32:35 -0600
Subject: [PATCH] io_uring: ensure recv and recvmsg handle MSG_WAITALL
correctly
We currently don't attempt to get the full asked-for length, even if
MSG_WAITALL is set, if we get a partial receive. If we do see a partial
receive, then just note how many bytes we did get and return -EAGAIN to
get it retried.
The iov is advanced appropriately for the vector-based case, and we
manually bump the buffer and remainder for the non-vector case.
Cc: stable(a)vger.kernel.org
Reported-by: Constantine Gavrilov <constantine.gavrilov(a)gmail.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f41d91ce1fd0..a70de170aea1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -612,6 +612,7 @@ struct io_sr_msg {
int msg_flags;
int bgid;
size_t len;
+ size_t done_io;
};
struct io_open {
@@ -5417,12 +5418,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
+ sr->done_io = 0;
return 0;
}
+static bool io_net_retry(struct socket *sock, int flags)
+{
+ if (!(flags & MSG_WAITALL))
+ return false;
+ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
+}
+
static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_async_msghdr iomsg, *kmsg;
+ struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
struct io_buffer *kbuf;
unsigned flags;
@@ -5465,6 +5475,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_msg(req, kmsg);
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ return io_setup_async_msg(req, kmsg);
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
req_set_fail(req);
@@ -5474,6 +5488,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5524,12 +5542,22 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ return -EAGAIN;
+ }
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
req_set_fail(req);
}
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
__io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
From: Rik van Riel <riel(a)surriel.com>
Subject: mm,hwpoison: unmap poisoned page before invalidation
In some cases it appears the invalidation of a hwpoisoned page fails
because the page is still mapped in another process. This can cause a
program to be continuously restarted and die when it page faults on the
page that was not invalidated. Avoid that problem by unmapping the
hwpoisoned page when we find it.
Another issue is that sometimes we end up oopsing in finish_fault, if the
code tries to do something with the now-NULL vmf->page. I did not hit
this error when submitting the previous patch because there are several
opportunities for alloc_set_pte to bail out before accessing vmf->page,
and that apparently happened on those systems, and most of the time on
other systems, too.
However, across several million systems that error does occur a handful of
times a day. It can be avoided by returning VM_FAULT_NOPAGE which will
cause do_read_fault to return before calling finish_fault.
Link: https://lkml.kernel.org/r/20220325161428.5068d97e@imladris.surriel.com
Fixes: e53ac7374e64 ("mm: invalidate hwpoison page cache page in fault path")
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Reviewed-by: Miaohe Lin <linmiaohe(a)huawei.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
--- a/mm/memory.c~mmhwpoison-unmap-poisoned-page-before-invalidation
+++ a/mm/memory.c
@@ -3918,14 +3918,18 @@ static vm_fault_t __do_fault(struct vm_f
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
+ struct page *page = vmf->page;
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
+ if (page_mapped(page))
+ unmap_mapping_pages(page_mapping(page),
+ page->index, 1, false);
/* Retry if a clean page was removed from the cache. */
- if (invalidate_inode_page(vmf->page))
- poisonret = 0;
- unlock_page(vmf->page);
+ if (invalidate_inode_page(page))
+ poisonret = VM_FAULT_NOPAGE;
+ unlock_page(page);
}
- put_page(vmf->page);
+ put_page(page);
vmf->page = NULL;
return poisonret;
}
_
From: Charan Teja Kalla <quic_charante(a)quicinc.com>
Subject: Revert "mm: madvise: skip unmapped vma holes passed to process_madvise"
This reverts commit 08095d6310a7 ("mm: madvise: skip unmapped vma holes
passed to process_madvise") as process_madvise() fails to return the exact
processed bytes in other cases too. As an example: if process_madvise()
hits mlocked pages after processing some initial bytes passed in [start,
end), it just returns EINVAL although some bytes were processed. Thus,
making an exception only for ENOMEM only partially fixes the problem of
returning the proper advised bytes.
Thus, revert this patch and return the proper bytes advised.
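For illustration, a minimal userspace sketch of the semantics being
restored (assumes a kernel/glibc recent enough to define
SYS_process_madvise and MADV_PAGEOUT; pidfd and iov setup are elsewhere):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

/* With this revert, a mid-way failure yields the bytes actually
 * advised so far; -1 with errno set means nothing was processed. */
static long advise_remote(int pidfd, const struct iovec *iov, size_t iovcnt)
{
	long ret = syscall(SYS_process_madvise, pidfd, iov, iovcnt,
			   MADV_PAGEOUT, 0);

	if (ret < 0)
		perror("process_madvise");
	return ret;	/* caller may compare against the total requested */
}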
Link: https://lkml.kernel.org/r/e73da1304a88b6a8a11907045117cccf4c2b8374.16480466…
Fixes: 08095d6310a7ce ("mm: madvise: skip unmapped vma holes passed to process_madvise")
Signed-off-by: Charan Teja Kalla <quic_charante(a)quicinc.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/madvise.c | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
--- a/mm/madvise.c~revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise
+++ a/mm/madvise.c
@@ -1464,16 +1464,9 @@ SYSCALL_DEFINE5(process_madvise, int, pi
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
- /*
- * do_madvise returns ENOMEM if unmapped holes are present
- * in the passed VMA. process_madvise() is expected to skip
- * unmapped holes passed to it in the 'struct iovec' list
- * and not fail because of them. Thus treat -ENOMEM return
- * from do_madvise as valid and continue processing.
- */
ret = do_madvise(mm, (unsigned long)iovec.iov_base,
iovec.iov_len, behavior);
- if (ret < 0 && ret != -ENOMEM)
+ if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
_
This is the start of the stable review cycle for the 4.14.275 release.
There are 27 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sun, 03 Apr 2022 06:36:16 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.275-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.14.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.14.275-rc1
James Morse <james.morse(a)arm.com>
arm64: Use the clearbhb instruction in mitigations
James Morse <james.morse(a)arm.com>
arm64: add ID_AA64ISAR2_EL1 sys register
James Morse <james.morse(a)arm.com>
KVM: arm64: Allow SMCCC_ARCH_WORKAROUND_3 to be discovered and migrated
James Morse <james.morse(a)arm.com>
arm64: Mitigate spectre style branch history side channels
James Morse <james.morse(a)arm.com>
KVM: arm64: Add templates for BHB mitigation sequences
James Morse <james.morse(a)arm.com>
arm64: proton-pack: Report Spectre-BHB vulnerabilities as part of Spectre-v2
James Morse <james.morse(a)arm.com>
arm64: Add percpu vectors for EL1
James Morse <james.morse(a)arm.com>
arm64: entry: Add macro for reading symbol addresses from the trampoline
James Morse <james.morse(a)arm.com>
arm64: entry: Add vectors that have the bhb mitigation sequences
James Morse <james.morse(a)arm.com>
arm64: entry: Add non-kpti __bp_harden_el1_vectors for mitigations
James Morse <james.morse(a)arm.com>
arm64: entry: Allow the trampoline text to occupy multiple pages
James Morse <james.morse(a)arm.com>
arm64: entry: Make the kpti trampoline's kpti sequence optional
James Morse <james.morse(a)arm.com>
arm64: entry: Move trampoline macros out of ifdef'd section
James Morse <james.morse(a)arm.com>
arm64: entry: Don't assume tramp_vectors is the start of the vectors
James Morse <james.morse(a)arm.com>
arm64: entry: Allow tramp_alias to access symbols after the 4K boundary
James Morse <james.morse(a)arm.com>
arm64: entry: Move the trampoline data page before the text page
James Morse <james.morse(a)arm.com>
arm64: entry: Free up another register on kpti's tramp_exit path
James Morse <james.morse(a)arm.com>
arm64: entry: Make the trampoline cleanup optional
James Morse <james.morse(a)arm.com>
arm64: entry.S: Add ventry overflow sanity checks
Anshuman Khandual <anshuman.khandual(a)arm.com>
arm64: Add Cortex-X2 CPU part definition
Suzuki K Poulose <suzuki.poulose(a)arm.com>
arm64: Add Neoverse-N2, Cortex-A710 CPU part definition
Rob Herring <robh(a)kernel.org>
arm64: Add part number for Arm Cortex-A77
Marc Zyngier <marc.zyngier(a)arm.com>
arm64: Add part number for Neoverse N1
Marc Zyngier <marc.zyngier(a)arm.com>
arm64: Make ARM64_ERRATUM_1188873 depend on COMPAT
Marc Zyngier <marc.zyngier(a)arm.com>
arm64: Add silicon-errata.txt entry for ARM erratum 1188873
Arnd Bergmann <arnd(a)arndb.de>
arm64: arch_timer: avoid unused function warning
Marc Zyngier <marc.zyngier(a)arm.com>
arm64: arch_timer: Add workaround for ARM erratum 1188873
-------------
Diffstat:
Documentation/arm64/silicon-errata.txt | 1 +
Makefile | 4 +-
arch/arm/include/asm/kvm_host.h | 6 +
arch/arm64/Kconfig | 24 ++
arch/arm64/include/asm/assembler.h | 34 +++
arch/arm64/include/asm/cpu.h | 1 +
arch/arm64/include/asm/cpucaps.h | 4 +-
arch/arm64/include/asm/cpufeature.h | 39 ++++
arch/arm64/include/asm/cputype.h | 20 ++
arch/arm64/include/asm/fixmap.h | 6 +-
arch/arm64/include/asm/kvm_host.h | 5 +
arch/arm64/include/asm/kvm_mmu.h | 2 +-
arch/arm64/include/asm/mmu.h | 8 +-
arch/arm64/include/asm/sections.h | 6 +
arch/arm64/include/asm/sysreg.h | 5 +
arch/arm64/include/asm/vectors.h | 74 ++++++
arch/arm64/kernel/bpi.S | 55 +++++
arch/arm64/kernel/cpu_errata.c | 395 ++++++++++++++++++++++++++++++++-
arch/arm64/kernel/cpufeature.c | 21 ++
arch/arm64/kernel/cpuinfo.c | 1 +
arch/arm64/kernel/entry.S | 196 ++++++++++++----
arch/arm64/kernel/vmlinux.lds.S | 2 +-
arch/arm64/kvm/hyp/hyp-entry.S | 4 +
arch/arm64/kvm/hyp/switch.c | 9 +-
arch/arm64/mm/mmu.c | 11 +-
drivers/clocksource/arm_arch_timer.c | 15 ++
include/linux/arm-smccc.h | 7 +
virt/kvm/arm/psci.c | 12 +
28 files changed, 909 insertions(+), 58 deletions(-)
From: Rik van Riel <riel(a)surriel.com>
Subject: mm,hwpoison: unmap poisoned page before invalidation
In some cases it appears the invalidation of a hwpoisoned page fails
because the page is still mapped in another process. This can cause a
program to be continuously restarted and die when it page faults on the
page that was not invalidated. Avoid that problem by unmapping the
hwpoisoned page when we find it.
Another issue is that sometimes we end up oopsing in finish_fault, if the
code tries to do something with the now-NULL vmf->page. I did not hit
this error when submitting the previous patch because there are several
opportunities for alloc_set_pte to bail out before accessing vmf->page,
and that apparently happened on those systems, and most of the time on
other systems, too.
However, across several million systems that error does occur a handful of
times a day. It can be avoided by returning VM_FAULT_NOPAGE which will
cause do_read_fault to return before calling finish_fault.
Link: https://lkml.kernel.org/r/20220325161428.5068d97e@imladris.surriel.com
Fixes: e53ac7374e64 ("mm: invalidate hwpoison page cache page in fault path")
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Reviewed-by: Miaohe Lin <linmiaohe(a)huawei.com>
Tested-by: Naoya Horiguchi <naoya.horiguchi(a)nec.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
--- a/mm/memory.c~mmhwpoison-unmap-poisoned-page-before-invalidation
+++ a/mm/memory.c
@@ -3918,14 +3918,18 @@ static vm_fault_t __do_fault(struct vm_f
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
+ struct page *page = vmf->page;
vm_fault_t poisonret = VM_FAULT_HWPOISON;
if (ret & VM_FAULT_LOCKED) {
+ if (page_mapped(page))
+ unmap_mapping_pages(page_mapping(page),
+ page->index, 1, false);
/* Retry if a clean page was removed from the cache. */
- if (invalidate_inode_page(vmf->page))
- poisonret = 0;
- unlock_page(vmf->page);
+ if (invalidate_inode_page(page))
+ poisonret = VM_FAULT_NOPAGE;
+ unlock_page(page);
}
- put_page(vmf->page);
+ put_page(page);
vmf->page = NULL;
return poisonret;
}
_
From: Charan Teja Kalla <quic_charante(a)quicinc.com>
Subject: Revert "mm: madvise: skip unmapped vma holes passed to process_madvise"
This reverts commit 08095d6310a7 ("mm: madvise: skip unmapped vma holes
passed to process_madvise") as process_madvise() fails to return the exact
processed bytes in other cases too. As an example: if process_madvise()
hits mlocked pages after processing some initial bytes passed in [start,
end), it just returns EINVAL although some bytes were processed. Thus,
making an exception only for ENOMEM only partially fixes the problem of
returning the proper advised bytes.
Thus, revert this patch and return the proper bytes advised.
Link: https://lkml.kernel.org/r/e73da1304a88b6a8a11907045117cccf4c2b8374.16480466…
Fixes: 08095d6310a7ce ("mm: madvise: skip unmapped vma holes passed to process_madvise")
Signed-off-by: Charan Teja Kalla <quic_charante(a)quicinc.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Nadav Amit <nadav.amit(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/madvise.c | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
--- a/mm/madvise.c~revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise
+++ a/mm/madvise.c
@@ -1464,16 +1464,9 @@ SYSCALL_DEFINE5(process_madvise, int, pi
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
- /*
- * do_madvise returns ENOMEM if unmapped holes are present
- * in the passed VMA. process_madvise() is expected to skip
- * unmapped holes passed to it in the 'struct iovec' list
- * and not fail because of them. Thus treat -ENOMEM return
- * from do_madvise as valid and continue processing.
- */
ret = do_madvise(mm, (unsigned long)iovec.iov_base,
iovec.iov_len, behavior);
- if (ret < 0 && ret != -ENOMEM)
+ if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
_
From version 2.38, binutils defaults to ISA spec version 20191213. This
means that the CSR read/write (csrr*/csrw*) instructions and the fence.i
instruction have been separated from the `I` extension and become two
standalone extensions: Zicsr and Zifencei. As the kernel uses those
instructions, this causes the following build failure:
CC arch/riscv/kernel/vdso/vgettimeofday.o
<<BUILDDIR>>/arch/riscv/include/asm/vdso/gettimeofday.h: Assembler messages:
<<BUILDDIR>>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01'
<<BUILDDIR>>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01'
<<BUILDDIR>>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01'
<<BUILDDIR>>/arch/riscv/include/asm/vdso/gettimeofday.h:71: Error: unrecognized opcode `csrr a5,0xc01'
The fix is to specify those extensions explicitly in -march. However, as
older binutils versions do not support this, we first need to detect that
support.
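For reference, an illustrative sketch (not taken from the kernel) of the
kind of construct that now requires Zicsr, matching the csrr failures in
the log above:
	/* Read the time CSR (0xc01); with binutils >= 2.38 this
	 * assembles only when Zicsr is present in -march. */
	static inline unsigned long rdtime(void)
	{
		unsigned long val;
		asm volatile("csrr %0, time" : "=r" (val));
		return val;
	}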
Cc: stable(a)vger.kernel.org # 4.15+
Cc: Kito Cheng <kito.cheng(a)gmail.com>
Signed-off-by: Aurelien Jarno <aurelien(a)aurel32.net>
---
arch/riscv/Makefile | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 8a107ed18b0d..7d81102cffd4 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -50,6 +50,12 @@ riscv-march-$(CONFIG_ARCH_RV32I) := rv32ima
riscv-march-$(CONFIG_ARCH_RV64I) := rv64ima
riscv-march-$(CONFIG_FPU) := $(riscv-march-y)fd
riscv-march-$(CONFIG_RISCV_ISA_C) := $(riscv-march-y)c
+
+# Newer binutils versions default to ISA spec version 20191213 which moves some
+# instructions from the I extension to the Zicsr and Zifencei extensions.
+toolchain-need-zicsr-zifencei := $(call cc-option-yn, -march=$(riscv-march-y)_zicsr_zifencei)
+riscv-march-$(toolchain-need-zicsr-zifencei) := $(riscv-march-y)_zicsr_zifencei
+
KBUILD_CFLAGS += -march=$(subst fd,,$(riscv-march-y))
KBUILD_AFLAGS += -march=$(riscv-march-y)
--
2.34.1
The patch below does not apply to the 5.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3fd07aecb75003fbcb0b7c3124d12f71ffd360d8 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal(a)opensource.wdc.com>
Date: Tue, 1 Mar 2022 20:30:09 +0900
Subject: [PATCH] scsi: scsi_debug: Fix qc_lock use in sdebug_blk_mq_poll()
The use of the 'locked' boolean variable to control locking and unlocking
of the qc_lock spinlock of struct sdebug_queue confuses sparse, leading to
a warning about an unexpected unlock. Simplify the qc_lock lock/unlock
handling code of this function to avoid this warning by removing the
'locked' boolean variable. This change also fixes unlocked access to the
in_use_bm bitmap with the find_first_bit() function.
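The resulting shape of the function is the usual lock/drop/retake
pattern; an illustrative sketch with hypothetical types and helpers, not
the driver code:
	/* Hold the lock while scanning, drop it around the callback,
	 * retake it before touching shared state such as the bitmap. */
	static int drain_queue(struct my_queue *q)
	{
		struct my_entry *e;
		unsigned long flags;
		int done = 0;
		spin_lock_irqsave(&q->lock, flags);
		while ((e = pick_entry_locked(q)) != NULL) {
			spin_unlock_irqrestore(&q->lock, flags);
			complete_entry(e);	/* callback runs unlocked */
			done++;
			spin_lock_irqsave(&q->lock, flags);
		}
		spin_unlock_irqrestore(&q->lock, flags);
		return done;
	}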
Link: https://lore.kernel.org/r/20220301113009.595857-3-damien.lemoal@opensource.…
Fixes: b05d4e481eff ("scsi: scsi_debug: Refine sdebug_blk_mq_poll()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Damien Le Moal <damien.lemoal(a)opensource.wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index f4e97f2224b2..25fa8e93f5a8 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -7509,7 +7509,6 @@ static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
{
bool first;
bool retiring = false;
- bool locked = false;
int num_entries = 0;
unsigned int qc_idx = 0;
unsigned long iflags;
@@ -7525,11 +7524,9 @@ static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
if (qc_idx >= sdebug_max_queue)
return 0;
+ spin_lock_irqsave(&sqp->qc_lock, iflags);
+
for (first = true; first || qc_idx + 1 < sdebug_max_queue; ) {
- if (!locked) {
- spin_lock_irqsave(&sqp->qc_lock, iflags);
- locked = true;
- }
if (first) {
first = false;
if (!test_bit(qc_idx, sqp->in_use_bm))
@@ -7586,14 +7583,15 @@ static int sdebug_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num)
}
WRITE_ONCE(sd_dp->defer_t, SDEB_DEFER_NONE);
spin_unlock_irqrestore(&sqp->qc_lock, iflags);
- locked = false;
scsi_done(scp); /* callback to mid level */
num_entries++;
+ spin_lock_irqsave(&sqp->qc_lock, iflags);
if (find_first_bit(sqp->in_use_bm, sdebug_max_queue) >= sdebug_max_queue)
- break; /* if no more then exit without retaking spinlock */
+ break;
}
- if (locked)
- spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+
+ spin_unlock_irqrestore(&sqp->qc_lock, iflags);
+
if (num_entries > 0)
atomic_add(num_entries, &sdeb_mq_poll_count);
return num_entries;
From: Zekun Shen <bruceshenzk(a)gmail.com>
[ Upstream commit 564d4eceb97eaf381dd6ef6470b06377bb50c95a ]
The bug was found during fuzzing. The stacktrace locates it in
ath5k_eeprom_convert_pcal_info_5111.
When none of the curves is selected in the loop, idx can go
up to AR5K_EEPROM_N_PD_CURVES. That makes the following line take pd
out of bounds:
pd = &chinfo[pier].pd_curves[idx];
There are many OOB writes using pd later in the code, so I
added a sanity check for idx. Checks for other loops involving
AR5K_EEPROM_N_PD_CURVES are not needed as the loop index is not
used outside those loops.
The patch is NOT tested with a real device.
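The underlying pattern is a search loop whose index is used after the
loop; an illustrative sketch (hypothetical names, not the driver code):
	#define N_CURVES 4
	static int pick_curve(const int *selected)
	{
		int idx;
		for (idx = 0; idx < N_CURVES; idx++)
			if (selected[idx])
				break;
		/* Without this check, idx == N_CURVES indexes out of
		 * bounds when nothing matched. */
		if (idx == N_CURVES)
			return -1;
		return idx;
	}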
The following is the fuzzing report
BUG: KASAN: slab-out-of-bounds in ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
Write of size 1 at addr ffff8880174a4d60 by task modprobe/214
CPU: 0 PID: 214 Comm: modprobe Not tainted 5.6.0 #1
Call Trace:
dump_stack+0x76/0xa0
print_address_description.constprop.0+0x16/0x200
? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
__kasan_report.cold+0x37/0x7c
? ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
kasan_report+0xe/0x20
ath5k_eeprom_read_pcal_info_5111+0x126a/0x1390 [ath5k]
? apic_timer_interrupt+0xa/0x20
? ath5k_eeprom_init_11a_pcal_freq+0xbc0/0xbc0 [ath5k]
? ath5k_pci_eeprom_read+0x228/0x3c0 [ath5k]
ath5k_eeprom_init+0x2513/0x6290 [ath5k]
? ath5k_eeprom_init_11a_pcal_freq+0xbc0/0xbc0 [ath5k]
? usleep_range+0xb8/0x100
? apic_timer_interrupt+0xa/0x20
? ath5k_eeprom_read_pcal_info_2413+0x2f20/0x2f20 [ath5k]
ath5k_hw_init+0xb60/0x1970 [ath5k]
ath5k_init_ah+0x6fe/0x2530 [ath5k]
? kasprintf+0xa6/0xe0
? ath5k_stop+0x140/0x140 [ath5k]
? _dev_notice+0xf6/0xf6
? apic_timer_interrupt+0xa/0x20
ath5k_pci_probe.cold+0x29a/0x3d6 [ath5k]
? ath5k_pci_eeprom_read+0x3c0/0x3c0 [ath5k]
? mutex_lock+0x89/0xd0
? ath5k_pci_eeprom_read+0x3c0/0x3c0 [ath5k]
local_pci_probe+0xd3/0x160
pci_device_probe+0x23f/0x3e0
? pci_device_remove+0x280/0x280
? pci_device_remove+0x280/0x280
really_probe+0x209/0x5d0
Reported-by: Brendan Dolan-Gavitt <brendandg(a)nyu.edu>
Signed-off-by: Zekun Shen <bruceshenzk(a)gmail.com>
Signed-off-by: Kalle Valo <quic_kvalo(a)quicinc.com>
Link: https://lore.kernel.org/r/YckvDdj3mtCkDRIt@a-10-27-26-18.dynapool.vpn.nyu.e…
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/net/wireless/ath/ath5k/eeprom.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/wireless/ath/ath5k/eeprom.c b/drivers/net/wireless/ath/ath5k/eeprom.c
index 94d34ee02265..01163b333945 100644
--- a/drivers/net/wireless/ath/ath5k/eeprom.c
+++ b/drivers/net/wireless/ath/ath5k/eeprom.c
@@ -746,6 +746,9 @@ ath5k_eeprom_convert_pcal_info_5111(struct ath5k_hw *ah, int mode,
}
}
+ if (idx == AR5K_EEPROM_N_PD_CURVES)
+ goto err_out;
+
ee->ee_pd_gains[mode] = 1;
pd = &chinfo[pier].pd_curves[idx];
--
2.34.1
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2e8e79c416aae1de224c0f1860f2e3350fa171f8 Mon Sep 17 00:00:00 2001
From: Marc Kleine-Budde <mkl(a)pengutronix.de>
Date: Thu, 17 Mar 2022 08:57:35 +0100
Subject: [PATCH] can: m_can: m_can_tx_handler(): fix use after free of skb
can_put_echo_skb() will clone the skb and then free it. Move the
can_put_echo_skb() call for m_can version 3.0.x to directly before the
start of the xmit in hardware, similar to the 3.1.x branch.
Fixes: 80646733f11c ("can: m_can: update to support CAN FD features")
Link: https://lore.kernel.org/all/20220317081305.739554-1-mkl@pengutronix.de
Cc: stable(a)vger.kernel.org
Reported-by: Hangyu Hua <hbh25y(a)gmail.com>
Signed-off-by: Marc Kleine-Budde <mkl(a)pengutronix.de>
diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 1a4b56f6fa8c..b3b5bc1c803b 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1637,8 +1637,6 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
if (err)
goto out_fail;
- can_put_echo_skb(skb, dev, 0, 0);
-
if (cdev->can.ctrlmode & CAN_CTRLMODE_FD) {
cccr = m_can_read(cdev, M_CAN_CCCR);
cccr &= ~CCCR_CMR_MASK;
@@ -1655,6 +1653,9 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev)
m_can_write(cdev, M_CAN_CCCR, cccr);
}
m_can_write(cdev, M_CAN_TXBTIE, 0x1);
+
+ can_put_echo_skb(skb, dev, 0, 0);
+
m_can_write(cdev, M_CAN_TXBAR, 0x1);
/* End of xmit function for version 3.0.x */
} else {
Hi Greg,
Sorry, please don't apply the patch to stable kernels. I tried to add
"linux,rs485-enabled-at-boot-time" support for this driver yesterday and
found that this patch is not correct. I will send a patch to revert it
from the mainline kernel.
Thanks,
Hui.
On 4/1/22 18:38, gregkh(a)linuxfoundation.org wrote:
> This is a note to let you know that I've just added the patch titled
>
> serial: sc16is7xx: Clear RS485 bits in the shutdown
>
> to the 4.9-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> serial-sc16is7xx-clear-rs485-bits-in-the-shutdown.patch
> and it can be found in the queue-4.9 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
> >From 927728a34f11b5a27f4610bdb7068317d6fdc72a Mon Sep 17 00:00:00 2001
> From: Hui Wang <hui.wang(a)canonical.com>
> Date: Tue, 8 Mar 2022 19:00:42 +0800
> Subject: serial: sc16is7xx: Clear RS485 bits in the shutdown
>
> From: Hui Wang <hui.wang(a)canonical.com>
>
> commit 927728a34f11b5a27f4610bdb7068317d6fdc72a upstream.
>
> We tested RS485 function on an EVB which has SC16IS752, after
> finishing the test, we started the RS232 function test, but found the
> RTS is still working in the RS485 mode.
>
> That is because both startup and shutdown call port_update() to set
> the EFCR_REG, this will not clear the RS485 bits once the bits are set
> in the reconf_rs485(). To fix it, clear the RS485 bits in shutdown.
>
> Cc: <stable(a)vger.kernel.org>
> Signed-off-by: Hui Wang <hui.wang(a)canonical.com>
> Link: https://lore.kernel.org/r/20220308110042.108451-1-hui.wang@canonical.com
> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
> ---
> drivers/tty/serial/sc16is7xx.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> --- a/drivers/tty/serial/sc16is7xx.c
> +++ b/drivers/tty/serial/sc16is7xx.c
> @@ -1055,10 +1055,12 @@ static void sc16is7xx_shutdown(struct ua
>
> /* Disable all interrupts */
> sc16is7xx_port_write(port, SC16IS7XX_IER_REG, 0);
> - /* Disable TX/RX */
> + /* Disable TX/RX, clear auto RS485 and RTS invert */
> sc16is7xx_port_update(port, SC16IS7XX_EFCR_REG,
> SC16IS7XX_EFCR_RXDISABLE_BIT |
> - SC16IS7XX_EFCR_TXDISABLE_BIT,
> + SC16IS7XX_EFCR_TXDISABLE_BIT |
> + SC16IS7XX_EFCR_AUTO_RS485_BIT |
> + SC16IS7XX_EFCR_RTS_INVERT_BIT,
> SC16IS7XX_EFCR_RXDISABLE_BIT |
> SC16IS7XX_EFCR_TXDISABLE_BIT);
>
>
>
> Patches currently in stable-queue which might be from hui.wang(a)canonical.com are
>
> queue-4.9/serial-sc16is7xx-clear-rs485-bits-in-the-shutdown.patch
Commit ed8cc3b1fc84 ("PCI: qcom: Add support for SDM845 PCIe
controller") introduced a clock imbalance by enabling the pipe clock
both in init() and in post_init() but only disabling it in post_deinit().
Note that the pipe clock was also never disabled in the init() error
paths and that enabling the clock before powering up the PHY looks
questionable.
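The underlying rule, as an illustrative sketch (hypothetical helpers,
not the driver code): every clk_prepare_enable() must be matched by
exactly one clk_disable_unprepare(), including on error paths:
	#include <linux/clk.h>
	static int pipe_clk_on(struct clk *pipe_clk)
	{
		int ret;
		ret = clk_prepare_enable(pipe_clk);	/* enable once */
		if (ret)
			return ret;
		/* ... power up the PHY after the clock is running ... */
		return 0;
	}
	static void pipe_clk_off(struct clk *pipe_clk)
	{
		/* ... power down the PHY ... */
		clk_disable_unprepare(pipe_clk);	/* the one matching disable */
	}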
Fixes: ed8cc3b1fc84 ("PCI: qcom: Add support for SDM845 PCIe controller")
Cc: stable(a)vger.kernel.org # 5.6
Cc: Bjorn Andersson <bjorn.andersson(a)linaro.org>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
Resending with lists on CC.
Johan
drivers/pci/controller/dwc/pcie-qcom.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
index b79d98e5e228..20a0e6533a1c 100644
--- a/drivers/pci/controller/dwc/pcie-qcom.c
+++ b/drivers/pci/controller/dwc/pcie-qcom.c
@@ -1238,12 +1238,6 @@ static int qcom_pcie_init_2_7_0(struct qcom_pcie *pcie)
goto err_disable_clocks;
}
- ret = clk_prepare_enable(res->pipe_clk);
- if (ret) {
- dev_err(dev, "cannot prepare/enable pipe clock\n");
- goto err_disable_clocks;
- }
-
/* Wait for reset to complete, required on SM8450 */
usleep_range(1000, 1500);
--
2.35.1
On Wed, Mar 30, 2022 at 04:54:53PM -0400, Joshua Freedman wrote:
> This felt really anti-climactic haha, but hopefully it's useful?
> cat bisect.log
> Bisecting: 81 revisions left to test after this (roughly 6 steps)
> [770aac3c84e0c83a19985413fa9fbfc126cc0ff6] net: mdio-ipq4019: add delay
> after clock enable
Wait, you still have more steps to go here. Did you test this kernel?
If so, you need to continue using 'git bisect good' and 'git bisect bad'
to find the offending commit. This looks just like the first step?
thanks,
greg k-h
On Wed, Mar 30, 2022 at 04:54:53PM -0400, Joshua Freedman wrote:
> This felt really anti-climactic haha, but hopefully it's useful?
> cat bisect.log
> Bisecting: 81 revisions left to test after this (roughly 6 steps)
> [770aac3c84e0c83a19985413fa9fbfc126cc0ff6] net: mdio-ipq4019: add delay
> after clock enable
Ok, great! Can you start a new thread and add the developers of that
change to to it to help track down the issue?
thanks,
greg k-h
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8cba323437a49a45756d661f500b324fc2d486fe Mon Sep 17 00:00:00 2001
From: Sean Nyekjaer <sean(a)geanix.com>
Date: Tue, 8 Feb 2022 09:52:13 +0100
Subject: [PATCH] mtd: rawnand: protect access to rawnand devices while in
suspend
Prevent rawnand access while in a suspended state.
Commit 013e6292aaf5 ("mtd: rawnand: Simplify the locking") allows the
rawnand layer to return errors rather than waiting in a blocking wait.
Tested on an iMX6ULL.
Fixes: 013e6292aaf5 ("mtd: rawnand: Simplify the locking")
Signed-off-by: Sean Nyekjaer <sean(a)geanix.com>
Reviewed-by: Boris Brezillon <boris.brezillon(a)collabora.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Miquel Raynal <miquel.raynal(a)bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220208085213.1838273-1-sean@geanix.com
diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c
index 3e4a525ac3ca..612ae60e9763 100644
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -338,16 +338,19 @@ static int nand_isbad_bbm(struct nand_chip *chip, loff_t ofs)
*
* Return: -EBUSY if the chip has been suspended, 0 otherwise
*/
-static int nand_get_device(struct nand_chip *chip)
+static void nand_get_device(struct nand_chip *chip)
{
- mutex_lock(&chip->lock);
- if (chip->suspended) {
+ /* Wait until the device is resumed. */
+ while (1) {
+ mutex_lock(&chip->lock);
+ if (!chip->suspended) {
+ mutex_lock(&chip->controller->lock);
+ return;
+ }
mutex_unlock(&chip->lock);
- return -EBUSY;
- }
- mutex_lock(&chip->controller->lock);
- return 0;
+ wait_event(chip->resume_wq, !chip->suspended);
+ }
}
/**
@@ -576,9 +579,7 @@ static int nand_block_markbad_lowlevel(struct nand_chip *chip, loff_t ofs)
nand_erase_nand(chip, &einfo, 0);
/* Write bad block marker to OOB */
- ret = nand_get_device(chip);
- if (ret)
- return ret;
+ nand_get_device(chip);
ret = nand_markbad_bbm(chip, ofs);
nand_release_device(chip);
@@ -3826,9 +3827,7 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from,
ops->mode != MTD_OPS_RAW)
return -ENOTSUPP;
- ret = nand_get_device(chip);
- if (ret)
- return ret;
+ nand_get_device(chip);
if (!ops->datbuf)
ret = nand_do_read_oob(chip, from, ops);
@@ -4415,13 +4414,11 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to,
struct mtd_oob_ops *ops)
{
struct nand_chip *chip = mtd_to_nand(mtd);
- int ret;
+ int ret = 0;
ops->retlen = 0;
- ret = nand_get_device(chip);
- if (ret)
- return ret;
+ nand_get_device(chip);
switch (ops->mode) {
case MTD_OPS_PLACE_OOB:
@@ -4481,9 +4478,7 @@ int nand_erase_nand(struct nand_chip *chip, struct erase_info *instr,
return -EIO;
/* Grab the lock and see if the device is available */
- ret = nand_get_device(chip);
- if (ret)
- return ret;
+ nand_get_device(chip);
/* Shift to get first page */
page = (int)(instr->addr >> chip->page_shift);
@@ -4570,7 +4565,7 @@ static void nand_sync(struct mtd_info *mtd)
pr_debug("%s: called\n", __func__);
/* Grab the lock and see if the device is available */
- WARN_ON(nand_get_device(chip));
+ nand_get_device(chip);
/* Release it and go back */
nand_release_device(chip);
}
@@ -4587,9 +4582,7 @@ static int nand_block_isbad(struct mtd_info *mtd, loff_t offs)
int ret;
/* Select the NAND device */
- ret = nand_get_device(chip);
- if (ret)
- return ret;
+ nand_get_device(chip);
nand_select_target(chip, chipnr);
@@ -4660,6 +4653,8 @@ static void nand_resume(struct mtd_info *mtd)
__func__);
}
mutex_unlock(&chip->lock);
+
+ wake_up_all(&chip->resume_wq);
}
/**
@@ -5438,6 +5433,7 @@ static int nand_scan_ident(struct nand_chip *chip, unsigned int maxchips,
chip->cur_cs = -1;
mutex_init(&chip->lock);
+ init_waitqueue_head(&chip->resume_wq);
/* Enforce the right timings for reset/detection */
chip->current_interface_config = nand_get_reset_interface_config();
diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h
index 5b88cd51fadb..dcf90144d70b 100644
--- a/include/linux/mtd/rawnand.h
+++ b/include/linux/mtd/rawnand.h
@@ -1240,6 +1240,7 @@ struct nand_secure_region {
* @lock: Lock protecting the suspended field. Also used to serialize accesses
* to the NAND device
* @suspended: Set to 1 when the device is suspended, 0 when it's not
+ * @resume_wq: wait queue to sleep if rawnand is in suspended state.
* @cur_cs: Currently selected target. -1 means no target selected, otherwise we
* should always have cur_cs >= 0 && cur_cs < nanddev_ntargets().
* NAND Controller drivers should not modify this value, but they're
@@ -1294,6 +1295,7 @@ struct nand_chip {
/* Internals */
struct mutex lock;
unsigned int suspended : 1;
+ wait_queue_head_t resume_wq;
int cur_cs;
int read_retries;
struct nand_secure_region *secure_regions;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 151c6b49d679872d6fc0b50e0ad96303091694a2 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Mon, 28 Feb 2022 18:33:34 +0200
Subject: [PATCH] mtd: spi-nor: Skip erase logic when SPI_NOR_NO_ERASE is set
Even if SPI_NOR_NO_ERASE was set, one could still send erase opcodes
to the flash. It is not recommended to send unsupported opcodes to
flashes. Fix the logic and do not set mtd->_erase when SPI_NOR_NO_ERASE
is specified. With this, users will not be able to issue erase opcodes to
flashes and will instead receive an -ENOTSUPP error.
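For context, a sketch of the caller-side effect (illustrative only,
assuming the mtd core rejects operations whose handler is unset):
	/* Hypothetical sketch: with mtd->_erase left unset for
	 * SPI_NOR_NO_ERASE flashes, erase requests fail up front. */
	static int mtd_erase_sketch(struct mtd_info *mtd,
				    struct erase_info *instr)
	{
		if (!mtd->_erase)
			return -ENOTSUPP;
		return mtd->_erase(mtd, instr);
	}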
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220228163334.277730-1-tudor.ambarus@microchip.c…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9014008e60b3..b4f141ad9c9c 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2948,10 +2948,11 @@ static void spi_nor_set_mtd_info(struct spi_nor *nor)
mtd->flags = MTD_CAP_NORFLASH;
if (nor->info->flags & SPI_NOR_NO_ERASE)
mtd->flags |= MTD_NO_ERASE;
+ else
+ mtd->_erase = spi_nor_erase;
mtd->writesize = nor->params->writesize;
mtd->writebufsize = nor->params->page_size;
mtd->size = nor->params->size;
- mtd->_erase = spi_nor_erase;
mtd->_read = spi_nor_read;
/* Might be already set by some SST flashes. */
if (!mtd->_write)
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 151c6b49d679872d6fc0b50e0ad96303091694a2 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Mon, 28 Feb 2022 18:33:34 +0200
Subject: [PATCH] mtd: spi-nor: Skip erase logic when SPI_NOR_NO_ERASE is set
Even if SPI_NOR_NO_ERASE was set, one could still send erase opcodes
to the flash. It is not recommended to send unsupported opcodes to
flashes. Fix the logic and do not set mtd->_erase when SPI_NOR_NO_ERASE
is specified. With this, users will not be able to issue erase opcodes to
flashes and will instead receive an -ENOTSUPP error.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220228163334.277730-1-tudor.ambarus@microchip.c…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9014008e60b3..b4f141ad9c9c 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2948,10 +2948,11 @@ static void spi_nor_set_mtd_info(struct spi_nor *nor)
mtd->flags = MTD_CAP_NORFLASH;
if (nor->info->flags & SPI_NOR_NO_ERASE)
mtd->flags |= MTD_NO_ERASE;
+ else
+ mtd->_erase = spi_nor_erase;
mtd->writesize = nor->params->writesize;
mtd->writebufsize = nor->params->page_size;
mtd->size = nor->params->size;
- mtd->_erase = spi_nor_erase;
mtd->_read = spi_nor_read;
/* Might be already set by some SST flashes. */
if (!mtd->_write)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 151c6b49d679872d6fc0b50e0ad96303091694a2 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Mon, 28 Feb 2022 18:33:34 +0200
Subject: [PATCH] mtd: spi-nor: Skip erase logic when SPI_NOR_NO_ERASE is set
Even if SPI_NOR_NO_ERASE was set, one could still send erase opcodes
to the flash. It is not recommended to send unsupported opcodes to
flashes. Fix the logic and do not set mtd->_erase when SPI_NOR_NO_ERASE
is specified. With this, users will not be able to issue erase opcodes to
flashes and will instead receive an -ENOTSUPP error.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220228163334.277730-1-tudor.ambarus@microchip.c…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9014008e60b3..b4f141ad9c9c 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2948,10 +2948,11 @@ static void spi_nor_set_mtd_info(struct spi_nor *nor)
mtd->flags = MTD_CAP_NORFLASH;
if (nor->info->flags & SPI_NOR_NO_ERASE)
mtd->flags |= MTD_NO_ERASE;
+ else
+ mtd->_erase = spi_nor_erase;
mtd->writesize = nor->params->writesize;
mtd->writebufsize = nor->params->page_size;
mtd->size = nor->params->size;
- mtd->_erase = spi_nor_erase;
mtd->_read = spi_nor_read;
/* Might be already set by some SST flashes. */
if (!mtd->_write)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 151c6b49d679872d6fc0b50e0ad96303091694a2 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Mon, 28 Feb 2022 18:33:34 +0200
Subject: [PATCH] mtd: spi-nor: Skip erase logic when SPI_NOR_NO_ERASE is set
Even if SPI_NOR_NO_ERASE was set, one could still send erase opcodes
to the flash. It is not recommended to send unsupported opcodes to
flashes. Fix the logic and do not set mtd->_erase when SPI_NOR_NO_ERASE
is specified. With this, users will not be able to issue erase opcodes to
flashes and will instead receive an -ENOTSUPP error.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220228163334.277730-1-tudor.ambarus@microchip.c…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9014008e60b3..b4f141ad9c9c 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2948,10 +2948,11 @@ static void spi_nor_set_mtd_info(struct spi_nor *nor)
mtd->flags = MTD_CAP_NORFLASH;
if (nor->info->flags & SPI_NOR_NO_ERASE)
mtd->flags |= MTD_NO_ERASE;
+ else
+ mtd->_erase = spi_nor_erase;
mtd->writesize = nor->params->writesize;
mtd->writebufsize = nor->params->page_size;
mtd->size = nor->params->size;
- mtd->_erase = spi_nor_erase;
mtd->_read = spi_nor_read;
/* Might be already set by some SST flashes. */
if (!mtd->_write)
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 151c6b49d679872d6fc0b50e0ad96303091694a2 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Mon, 28 Feb 2022 18:33:34 +0200
Subject: [PATCH] mtd: spi-nor: Skip erase logic when SPI_NOR_NO_ERASE is set
Even if SPI_NOR_NO_ERASE was set, one could still send erase opcodes
to the flash. It is not recommended to send unsupported opcodes to
flashes. Fix the logic and do not set mtd->_erase when SPI_NOR_NO_ERASE
is specified. With this, users will not be able to issue erase opcodes to
flashes and will instead receive an -ENOTSUPP error.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220228163334.277730-1-tudor.ambarus@microchip.c…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9014008e60b3..b4f141ad9c9c 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2948,10 +2948,11 @@ static void spi_nor_set_mtd_info(struct spi_nor *nor)
mtd->flags = MTD_CAP_NORFLASH;
if (nor->info->flags & SPI_NOR_NO_ERASE)
mtd->flags |= MTD_NO_ERASE;
+ else
+ mtd->_erase = spi_nor_erase;
mtd->writesize = nor->params->writesize;
mtd->writebufsize = nor->params->page_size;
mtd->size = nor->params->size;
- mtd->_erase = spi_nor_erase;
mtd->_read = spi_nor_read;
/* Might be already set by some SST flashes. */
if (!mtd->_write)
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 151c6b49d679872d6fc0b50e0ad96303091694a2 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Mon, 28 Feb 2022 18:33:34 +0200
Subject: [PATCH] mtd: spi-nor: Skip erase logic when SPI_NOR_NO_ERASE is set
Even if SPI_NOR_NO_ERASE was set, one could still send erase opcodes
to the flash. It is not recommended to send unsupported opcodes to
flashes. Fix the logic and do not set mtd->_erase when SPI_NOR_NO_ERASE
is specified. With this, users will not be able to issue erase opcodes to
flashes and will instead receive an -ENOTSUPP error.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220228163334.277730-1-tudor.ambarus@microchip.c…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9014008e60b3..b4f141ad9c9c 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2948,10 +2948,11 @@ static void spi_nor_set_mtd_info(struct spi_nor *nor)
mtd->flags = MTD_CAP_NORFLASH;
if (nor->info->flags & SPI_NOR_NO_ERASE)
mtd->flags |= MTD_NO_ERASE;
+ else
+ mtd->_erase = spi_nor_erase;
mtd->writesize = nor->params->writesize;
mtd->writebufsize = nor->params->page_size;
mtd->size = nor->params->size;
- mtd->_erase = spi_nor_erase;
mtd->_read = spi_nor_read;
/* Might be already set by some SST flashes. */
if (!mtd->_write)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 151c6b49d679872d6fc0b50e0ad96303091694a2 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Date: Mon, 28 Feb 2022 18:33:34 +0200
Subject: [PATCH] mtd: spi-nor: Skip erase logic when SPI_NOR_NO_ERASE is set
Even if SPI_NOR_NO_ERASE was set, one could still send erase opcodes
to the flash. It is not recommended to send unsupported opcodes to
flashes. Fix the logic and do not set mtd->_erase when SPI_NOR_NO_ERASE
is specified. With this, users will not be able to issue erase opcodes to
flashes and will instead receive an -ENOTSUPP error.
Fixes: b199489d37b2 ("mtd: spi-nor: add the framework for SPI NOR")
Signed-off-by: Tudor Ambarus <tudor.ambarus(a)microchip.com>
Reviewed-by: Michael Walle <michael(a)walle.cc>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20220228163334.277730-1-tudor.ambarus@microchip.c…
diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c
index 9014008e60b3..b4f141ad9c9c 100644
--- a/drivers/mtd/spi-nor/core.c
+++ b/drivers/mtd/spi-nor/core.c
@@ -2948,10 +2948,11 @@ static void spi_nor_set_mtd_info(struct spi_nor *nor)
mtd->flags = MTD_CAP_NORFLASH;
if (nor->info->flags & SPI_NOR_NO_ERASE)
mtd->flags |= MTD_NO_ERASE;
+ else
+ mtd->_erase = spi_nor_erase;
mtd->writesize = nor->params->writesize;
mtd->writebufsize = nor->params->page_size;
mtd->size = nor->params->size;
- mtd->_erase = spi_nor_erase;
mtd->_read = spi_nor_read;
/* Might be already set by some SST flashes. */
if (!mtd->_write)
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 50ebd19e3585b9792e994cfa8cbee8947fe06371 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk(a)kernel.org>
Date: Tue, 11 Jan 2022 21:13:59 +0100
Subject: [PATCH] pinctrl: samsung: drop pin banks references on error paths
The driver iterates over its devicetree children with
for_each_child_of_node() and stores for later found node pointer. This
has to be put in error paths to avoid leak during re-probing.
Fixes: ab663789d697 ("pinctrl: samsung: Match pin banks with their device nodes")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski(a)canonical.com>
Reviewed-by: Sam Protsenko <semen.protsenko(a)linaro.org>
Reviewed-by: Chanho Park <chanho61.park(a)samsung.com>
Link: https://lore.kernel.org/r/20220111201426.326777-2-krzysztof.kozlowski@canon…
diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c
index 0f6e9305fec5..c4175fea7d74 100644
--- a/drivers/pinctrl/samsung/pinctrl-samsung.c
+++ b/drivers/pinctrl/samsung/pinctrl-samsung.c
@@ -1002,6 +1002,16 @@ samsung_pinctrl_get_soc_data_for_of_alias(struct platform_device *pdev)
return &(of_data->ctrl[id]);
}
+static void samsung_banks_of_node_put(struct samsung_pinctrl_drv_data *d)
+{
+ struct samsung_pin_bank *bank;
+ unsigned int i;
+
+ bank = d->pin_banks;
+ for (i = 0; i < d->nr_banks; ++i, ++bank)
+ of_node_put(bank->of_node);
+}
+
/* retrieve the soc specific data */
static const struct samsung_pin_ctrl *
samsung_pinctrl_get_soc_data(struct samsung_pinctrl_drv_data *d,
@@ -1117,19 +1127,19 @@ static int samsung_pinctrl_probe(struct platform_device *pdev)
if (ctrl->retention_data) {
drvdata->retention_ctrl = ctrl->retention_data->init(drvdata,
ctrl->retention_data);
- if (IS_ERR(drvdata->retention_ctrl))
- return PTR_ERR(drvdata->retention_ctrl);
+ if (IS_ERR(drvdata->retention_ctrl)) {
+ ret = PTR_ERR(drvdata->retention_ctrl);
+ goto err_put_banks;
+ }
}
ret = samsung_pinctrl_register(pdev, drvdata);
if (ret)
- return ret;
+ goto err_put_banks;
ret = samsung_gpiolib_register(pdev, drvdata);
- if (ret) {
- samsung_pinctrl_unregister(pdev, drvdata);
- return ret;
- }
+ if (ret)
+ goto err_unregister;
if (ctrl->eint_gpio_init)
ctrl->eint_gpio_init(drvdata);
@@ -1139,6 +1149,12 @@ static int samsung_pinctrl_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, drvdata);
return 0;
+
+err_unregister:
+ samsung_pinctrl_unregister(pdev, drvdata);
+err_put_banks:
+ samsung_banks_of_node_put(drvdata);
+ return ret;
}
/*
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b622ffe1d9ecbac71f0cddb52ff0831efdf8fb83 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust(a)hammerspace.com>
Date: Tue, 22 Feb 2022 18:20:38 -0500
Subject: [PATCH] NFS: NFSv2/v3 clients should never be setting NFS_CAP_XATTR
Ensure that we always initialise the 'xattr_support' field in struct
nfs_fsinfo, so that nfs_server_set_fsinfo() doesn't declare our NFSv2/v3
client to be capable of supporting the NFSv4.2 xattr protocol by setting
the NFS_CAP_XATTR capability.
This configuration can cause nfs_do_access() to set access mode bits
that are unsupported by the NFSv3 ACCESS call, which may confuse
spec-compliant servers.
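The capability wiring this protects, as an illustrative sketch of the
consumer side (not the exact fs/nfs code):
	/* A garbage, nonzero xattr_support here would spuriously
	 * enable the NFSv4.2 xattr capability for NFSv2/v3. */
	static void set_xattr_cap(struct nfs_server *server,
				  const struct nfs_fsinfo *fsinfo)
	{
		if (fsinfo->xattr_support)
			server->caps |= NFS_CAP_XATTR;
		else
			server->caps &= ~NFS_CAP_XATTR;
	}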
Reported-by: Olga Kornievskaia <kolga(a)netapp.com>
Fixes: b78ef845c35d ("NFSv4.2: query the server for extended attribute support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust(a)hammerspace.com>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9274c9c5efea..54a1d21cbcc6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2228,6 +2228,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
/* ignore properties */
result->lease_time = 0;
result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ result->xattr_support = 0;
return 0;
}
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 73dcaa99fa9b..e3570c656b0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -92,6 +92,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
info->maxfilesize = 0x7FFFFFFF;
info->lease_time = 0;
info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+ info->xattr_support = 0;
return 0;
}
Bios queued into the BFQ IO scheduler can be associated with a cgroup
that was already offlined. This may then cause insertion of this
bfq_group into a service tree. But this bfq_group will get freed as soon
as the last bio associated with it completes, leading to use-after-free
issues for service tree users. Fix the problem by making sure we always
operate on an online bfq_group. If the bfq_group associated with the bio
is not online, we pick the first online parent.
CC: stable(a)vger.kernel.org
Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Tested-by: "yukuai (C)" <yukuai3(a)huawei.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
block/bfq-cgroup.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 32d2c2a47480..09574af83566 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -612,10 +612,19 @@ static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg)
struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio)
{
struct blkcg_gq *blkg = bio->bi_blkg;
+ struct bfq_group *bfqg;
- if (!blkg)
- return bfqd->root_group;
- return blkg_to_bfqg(blkg);
+ while (blkg) {
+ bfqg = blkg_to_bfqg(blkg);
+ if (bfqg->online) {
+ bio_associate_blkg_from_css(bio, &blkg->blkcg->css);
+ return bfqg;
+ }
+ blkg = blkg->parent;
+ }
+ bio_associate_blkg_from_css(bio,
+ &bfqg_to_blkg(bfqd->root_group)->blkcg->css);
+ return bfqd->root_group;
}
/**
--
2.34.1
Track whether a bfq_group is still online. We cannot rely on
blkcg_gq->online because that gets cleared only after all policies are
offlined. We need something that is updated under bfqd->lock while we
are cleaning up our bfq_group, so that we can guarantee that when we see
an online bfq_group, it will stay online while we are holding
bfqd->lock.
CC: stable(a)vger.kernel.org
Tested-by: "yukuai (C)" <yukuai3(a)huawei.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
block/bfq-cgroup.c | 3 ++-
block/bfq-iosched.h | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9352f3cc2377..879380c2bc7e 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -557,6 +557,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
*/
bfqg->bfqd = bfqd;
bfqg->active_entities = 0;
+ bfqg->online = true;
bfqg->rq_pos_tree = RB_ROOT;
}
@@ -603,7 +604,6 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
struct bfq_entity *entity;
bfqg = bfq_lookup_bfqg(bfqd, blkcg);
-
if (unlikely(!bfqg))
return NULL;
@@ -979,6 +979,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
put_async_queues:
bfq_put_async_queues(bfqd, bfqg);
+ bfqg->online = false;
spin_unlock_irqrestore(&bfqd->lock, flags);
/*
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index a56763045d19..4664e2f3e828 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -928,6 +928,8 @@ struct bfq_group {
/* reference counter (see comments in bfq_bic_update_cgroup) */
int ref;
+ /* Is bfq_group still online? */
+ bool online;
struct bfq_entity entity;
struct bfq_sched_data sched_data;
--
2.34.1
In bfq_insert_request() we unlock bfqd->lock only to call
trace_block_rq_insert() and then lock bfqd->lock again. This is
pointless: tracing is disabled when we really care about performance,
and even when the tracepoint is enabled, it is a quick call.
CC: stable(a)vger.kernel.org
Tested-by: "yukuai (C)" <yukuai3(a)huawei.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
block/bfq-iosched.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 1fc4d4628fba..19082e14f3c1 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6150,11 +6150,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
return;
}
- spin_unlock_irq(&bfqd->lock);
-
trace_block_rq_insert(rq);
- spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
if (!bfqq || at_head) {
if (at_head)
--
2.34.1
When a process is migrated to a different cgroup (or, in the case of
writeback, just starts submitting bios associated with a different
cgroup), bfq_merge_bio() can operate with stale cgroup information in
the bic. Thus the bio can be merged into a request from a different
cgroup, or the merge can combine bfqqs of different cgroups or bfqqs of
already dead cgroups, causing possible use-after-free issues. Fix the
problem by updating the cgroup information in bfq_merge_bio().
CC: stable(a)vger.kernel.org
Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Tested-by: "yukuai (C)" <yukuai3(a)huawei.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
block/bfq-iosched.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 89fe3f85eb3c..1fc4d4628fba 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2457,10 +2457,17 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
spin_lock_irq(&bfqd->lock);
- if (bic)
+ if (bic) {
+ /*
+ * Make sure cgroup info is uptodate for current process before
+ * considering the merge.
+ */
+ bfq_bic_update_cgroup(bic, bio);
+
bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
- else
+ } else {
bfqd->bio_bfqq = NULL;
+ }
bfqd->bio_bic = bic;
ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
--
2.34.1
When a bfqq is shared by multiple processes, it can happen that one of
the processes gets moved to a different cgroup (or just starts
submitting IO for a different cgroup). When that happens we need to
split the merged bfqq, as otherwise we will have IO for multiple cgroups
in one bfqq and will account IO time to the wrong entities, etc.
Similarly, if the bfqq is scheduled to merge with another bfqq but the
merge has not happened yet, cancel the merge as it may no longer be
valid.
CC: stable(a)vger.kernel.org
Fixes: e21b7a0b9887 ("block, bfq: add full hierarchical scheduling and cgroups support")
Tested-by: "yukuai (C)" <yukuai3(a)huawei.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
block/bfq-cgroup.c | 36 +++++++++++++++++++++++++++++++++---
block/bfq-iosched.c | 2 +-
block/bfq-iosched.h | 1 +
3 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 420eda2589c0..9352f3cc2377 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -743,9 +743,39 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
}
if (sync_bfqq) {
- entity = &sync_bfqq->entity;
- if (entity->sched_data != &bfqg->sched_data)
- bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) {
+ /* We are the only user of this bfqq, just move it */
+ if (sync_bfqq->entity.sched_data != &bfqg->sched_data)
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ } else {
+ struct bfq_queue *bfqq;
+
+ /*
+ * The queue was merged to a different queue. Check
+ * that the merge chain still belongs to the same
+ * cgroup.
+ */
+ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq)
+ if (bfqq->entity.sched_data !=
+ &bfqg->sched_data)
+ break;
+ if (bfqq) {
+ /*
+ * Some queue changed cgroup so the merge is
+ * not valid anymore. We cannot easily just
+ * cancel the merge (by clearing new_bfqq) as
+ * there may be other processes using this
+ * queue and holding refs to all queues below
+ * sync_bfqq->new_bfqq. Similarly if the merge
+ * already happened, we need to detach from
+ * bfqq now so that we cannot merge bio to a
+ * request from the old cgroup.
+ */
+ bfq_put_cooperator(sync_bfqq);
+ bfq_release_process_ref(bfqd, sync_bfqq);
+ bic_set_bfqq(bic, NULL, 1);
+ }
+ }
}
return bfqg;
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 7d00b21ebe5d..89fe3f85eb3c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5315,7 +5315,7 @@ static void bfq_put_stable_ref(struct bfq_queue *bfqq)
bfq_put_queue(bfqq);
}
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
+void bfq_put_cooperator(struct bfq_queue *bfqq)
{
struct bfq_queue *__bfqq, *next;
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 3b83e3d1c2e5..a56763045d19 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -979,6 +979,7 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
+void bfq_put_cooperator(struct bfq_queue *bfqq);
void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_schedule_dispatch(struct bfq_data *bfqd);
--
2.34.1
It can happen that the parent of a bfqq changes between the moment we
decide two queues are worth merging (and set bic->stable_merge_bfqq)
and the moment bfq_setup_merge() is called. This can happen e.g. because
the process submitted IO for a different cgroup and thus the bfqq got
reparented. It can even happen that the bfqq we are merging with has a
parent cgroup that is already offline and going to be destroyed, in
which case the merge can lead to use-after-free issues such as:
BUG: KASAN: use-after-free in __bfq_deactivate_entity+0x9cb/0xa50
Read of size 8 at addr ffff88800693c0c0 by task runc:[2:INIT]/10544
CPU: 0 PID: 10544 Comm: runc:[2:INIT] Tainted: G E 5.15.2-0.g5fb85fd-default #1 openSUSE Tumbleweed (unreleased) f1f3b891c72369aebecd2e43e4641a6358867c70
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014
Call Trace:
<IRQ>
dump_stack_lvl+0x46/0x5a
print_address_description.constprop.0+0x1f/0x140
? __bfq_deactivate_entity+0x9cb/0xa50
kasan_report.cold+0x7f/0x11b
? __bfq_deactivate_entity+0x9cb/0xa50
__bfq_deactivate_entity+0x9cb/0xa50
? update_curr+0x32f/0x5d0
bfq_deactivate_entity+0xa0/0x1d0
bfq_del_bfqq_busy+0x28a/0x420
? resched_curr+0x116/0x1d0
? bfq_requeue_bfqq+0x70/0x70
? check_preempt_wakeup+0x52b/0xbc0
__bfq_bfqq_expire+0x1a2/0x270
bfq_bfqq_expire+0xd16/0x2160
? try_to_wake_up+0x4ee/0x1260
? bfq_end_wr_async_queues+0xe0/0xe0
? _raw_write_unlock_bh+0x60/0x60
? _raw_spin_lock_irq+0x81/0xe0
bfq_idle_slice_timer+0x109/0x280
? bfq_dispatch_request+0x4870/0x4870
__hrtimer_run_queues+0x37d/0x700
? enqueue_hrtimer+0x1b0/0x1b0
? kvm_clock_get_cycles+0xd/0x10
? ktime_get_update_offsets_now+0x6f/0x280
hrtimer_interrupt+0x2c8/0x740
Fix the problem by checking that the parent of the two bfqqs we are
merging in bfq_setup_merge() is the same.
Link: https://lore.kernel.org/linux-block/20211125172809.GC19572@quack2.suse.cz/
CC: stable(a)vger.kernel.org
Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues")
Tested-by: "yukuai (C)" <yukuai3(a)huawei.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
block/bfq-iosched.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 6d122c28086e..7d00b21ebe5d 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2758,6 +2758,14 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
if (process_refs == 0 || new_process_refs == 0)
return NULL;
+ /*
+ * Make sure merged queues belong to the same parent. Parents could
+ * have changed since the time we decided the two queues are suitable
+ * for merging.
+ */
+ if (new_bfqq->entity.parent != bfqq->entity.parent)
+ return NULL;
+
bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
new_bfqq->pid);
--
2.34.1
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b850b7a8b369322adf699ef48ceff4d902525c8c Mon Sep 17 00:00:00 2001
From: Ang Tien Sung <tien.sung.ang(a)intel.com>
Date: Wed, 23 Feb 2022 08:41:46 -0600
Subject: [PATCH] firmware: stratix10-svc: add missing callback parameter on
RSU
Fix a bug whereby the return response in parameter a1 from an
SMC call is not properly set in the callback data during an
INTEL_SIP_SMC_RSU_ERROR command.
Link: https://lore.kernel.org/lkml/20220216081513.28319-1-tien.sung.ang@intel.com
Fixes: 6b50d882d38d ("firmware: add remote status update client support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ang Tien Sung <tien.sung.ang(a)intel.com>
Signed-off-by: Dinh Nguyen <dinguyen(a)kernel.org>
Link: https://lore.kernel.org/r/20220223144146.399263-1-dinguyen@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c
index 4bd57a908efe..8177a0fae11d 100644
--- a/drivers/firmware/stratix10-svc.c
+++ b/drivers/firmware/stratix10-svc.c
@@ -483,7 +483,7 @@ static int svc_normal_to_secure_thread(void *data)
case INTEL_SIP_SMC_RSU_ERROR:
pr_err("%s: STATUS_ERROR\n", __func__);
cbdata->status = BIT(SVC_STATUS_ERROR);
- cbdata->kaddr1 = NULL;
+ cbdata->kaddr1 = &res.a1;
cbdata->kaddr2 = NULL;
cbdata->kaddr3 = NULL;
pdata->chan->scl->receive_cb(pdata->chan->scl, cbdata);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 571426631acf46e2999c7ecd1e9d048172969a43 Mon Sep 17 00:00:00 2001
From: Billy Tsai <billy_tsai(a)aspeedtech.com>
Date: Mon, 21 Feb 2022 09:27:05 +0800
Subject: [PATCH] iio: adc: aspeed: Add divider flag to fix incorrect voltage
reading.
The formula for the ADC sampling period in ast2400/ast2500 is:
ADC clock period = PCLK * 2 * (ADC0C[31:17] + 1) * (ADC0C[9:0])
When ADC0C[9:0] is set to 0, the sampling voltage will be lower than
expected, because the hardware may not have enough time to
charge/discharge to a stable voltage. This patch uses the flag
CLK_DIVIDER_ONE_BASED, which uses the raw value read from the
register, with a value of zero considered invalid, to conform to the
corrected formula.
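As a worked example (illustrative numbers only): with ADC0C[31:17] = 0
and ADC0C[9:0] = 0, the formula above yields PCLK * 2 * 1 * 0 = 0, i.e.
a zero-length sampling period. With CLK_DIVIDER_ONE_BASED the raw
register value is used as the divider and 0 is rejected as invalid, so
the smallest valid period becomes PCLK * 2 * 1 * 1.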
Fixes: 573803234e72 ("iio: Aspeed ADC")
Reported-by: Konstantin Klubnichkin <kitsok(a)yandex-team.ru>
Signed-off-by: Billy Tsai <billy_tsai(a)aspeedtech.com>
Reviewed-by: Joel Stanley <joel(a)jms.id.au>
Link: https://lore.kernel.org/r/20220221012705.22008-1-billy_tsai@aspeedtech.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c
index e939b84cbb56..0793d2474cdc 100644
--- a/drivers/iio/adc/aspeed_adc.c
+++ b/drivers/iio/adc/aspeed_adc.c
@@ -539,7 +539,9 @@ static int aspeed_adc_probe(struct platform_device *pdev)
data->clk_scaler = devm_clk_hw_register_divider(
&pdev->dev, clk_name, clk_parent_name, scaler_flags,
data->base + ASPEED_REG_CLOCK_CONTROL, 0,
- data->model_data->scaler_bit_width, 0, &data->clk_lock);
+ data->model_data->scaler_bit_width,
+ data->model_data->need_prescaler ? CLK_DIVIDER_ONE_BASED : 0,
+ &data->clk_lock);
if (IS_ERR(data->clk_scaler))
return PTR_ERR(data->clk_scaler);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 571426631acf46e2999c7ecd1e9d048172969a43 Mon Sep 17 00:00:00 2001
From: Billy Tsai <billy_tsai(a)aspeedtech.com>
Date: Mon, 21 Feb 2022 09:27:05 +0800
Subject: [PATCH] iio: adc: aspeed: Add divider flag to fix incorrect voltage
reading.
The formula for the ADC sampling period in ast2400/ast2500 is:
ADC clock period = PCLK * 2 * (ADC0C[31:17] + 1) * (ADC0C[9:0])
When ADC0C[9:0] is set to 0, the sampling voltage will be lower than
expected, because the hardware may not have enough time to
charge/discharge to a stable voltage. This patch uses the flag
CLK_DIVIDER_ONE_BASED, which uses the raw value read from the
register, with a value of zero considered invalid, to conform to the
corrected formula.
Fixes: 573803234e72 ("iio: Aspeed ADC")
Reported-by: Konstantin Klubnichkin <kitsok(a)yandex-team.ru>
Signed-off-by: Billy Tsai <billy_tsai(a)aspeedtech.com>
Reviewed-by: Joel Stanley <joel(a)jms.id.au>
Link: https://lore.kernel.org/r/20220221012705.22008-1-billy_tsai@aspeedtech.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c
index e939b84cbb56..0793d2474cdc 100644
--- a/drivers/iio/adc/aspeed_adc.c
+++ b/drivers/iio/adc/aspeed_adc.c
@@ -539,7 +539,9 @@ static int aspeed_adc_probe(struct platform_device *pdev)
data->clk_scaler = devm_clk_hw_register_divider(
&pdev->dev, clk_name, clk_parent_name, scaler_flags,
data->base + ASPEED_REG_CLOCK_CONTROL, 0,
- data->model_data->scaler_bit_width, 0, &data->clk_lock);
+ data->model_data->scaler_bit_width,
+ data->model_data->need_prescaler ? CLK_DIVIDER_ONE_BASED : 0,
+ &data->clk_lock);
if (IS_ERR(data->clk_scaler))
return PTR_ERR(data->clk_scaler);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 571426631acf46e2999c7ecd1e9d048172969a43 Mon Sep 17 00:00:00 2001
From: Billy Tsai <billy_tsai(a)aspeedtech.com>
Date: Mon, 21 Feb 2022 09:27:05 +0800
Subject: [PATCH] iio: adc: aspeed: Add divider flag to fix incorrect voltage
reading.
The formula for the ADC sampling period in ast2400/ast2500 is:
ADC clock period = PCLK * 2 * (ADC0C[31:17] + 1) * (ADC0C[9:0])
When ADC0C[9:0] is set to 0, the sampling voltage will be lower than
expected, because the hardware may not have enough time to
charge/discharge to a stable voltage. This patch uses the flag
CLK_DIVIDER_ONE_BASED, which uses the raw value read from the
register, with a value of zero considered invalid, to conform to the
corrected formula.
Fixes: 573803234e72 ("iio: Aspeed ADC")
Reported-by: Konstantin Klubnichkin <kitsok(a)yandex-team.ru>
Signed-off-by: Billy Tsai <billy_tsai(a)aspeedtech.com>
Reviewed-by: Joel Stanley <joel(a)jms.id.au>
Link: https://lore.kernel.org/r/20220221012705.22008-1-billy_tsai@aspeedtech.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c
index e939b84cbb56..0793d2474cdc 100644
--- a/drivers/iio/adc/aspeed_adc.c
+++ b/drivers/iio/adc/aspeed_adc.c
@@ -539,7 +539,9 @@ static int aspeed_adc_probe(struct platform_device *pdev)
data->clk_scaler = devm_clk_hw_register_divider(
&pdev->dev, clk_name, clk_parent_name, scaler_flags,
data->base + ASPEED_REG_CLOCK_CONTROL, 0,
- data->model_data->scaler_bit_width, 0, &data->clk_lock);
+ data->model_data->scaler_bit_width,
+ data->model_data->need_prescaler ? CLK_DIVIDER_ONE_BASED : 0,
+ &data->clk_lock);
if (IS_ERR(data->clk_scaler))
return PTR_ERR(data->clk_scaler);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 571426631acf46e2999c7ecd1e9d048172969a43 Mon Sep 17 00:00:00 2001
From: Billy Tsai <billy_tsai(a)aspeedtech.com>
Date: Mon, 21 Feb 2022 09:27:05 +0800
Subject: [PATCH] iio: adc: aspeed: Add divider flag to fix incorrect voltage
reading.
The formula for the ADC sampling period in ast2400/ast2500 is:
ADC clock period = PCLK * 2 * (ADC0C[31:17] + 1) * (ADC0C[9:0])
When ADC0C[9:0] is set to 0 the sampling voltage will be lower than
expected, because the hardware may not have enough time to
charge/discharge to a stable voltage. This patch uses the
CLK_DIVIDER_ONE_BASED flag, which makes the divider use the raw value
read from the register and treats a value of zero as invalid, conforming
to the corrected formula.
Fixes: 573803234e72 ("iio: Aspeed ADC")
Reported-by: Konstantin Klubnichkin <kitsok(a)yandex-team.ru>
Signed-off-by: Billy Tsai <billy_tsai(a)aspeedtech.com>
Reviewed-by: Joel Stanley <joel(a)jms.id.au>
Link: https://lore.kernel.org/r/20220221012705.22008-1-billy_tsai@aspeedtech.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c
index e939b84cbb56..0793d2474cdc 100644
--- a/drivers/iio/adc/aspeed_adc.c
+++ b/drivers/iio/adc/aspeed_adc.c
@@ -539,7 +539,9 @@ static int aspeed_adc_probe(struct platform_device *pdev)
data->clk_scaler = devm_clk_hw_register_divider(
&pdev->dev, clk_name, clk_parent_name, scaler_flags,
data->base + ASPEED_REG_CLOCK_CONTROL, 0,
- data->model_data->scaler_bit_width, 0, &data->clk_lock);
+ data->model_data->scaler_bit_width,
+ data->model_data->need_prescaler ? CLK_DIVIDER_ONE_BASED : 0,
+ &data->clk_lock);
if (IS_ERR(data->clk_scaler))
return PTR_ERR(data->clk_scaler);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 571426631acf46e2999c7ecd1e9d048172969a43 Mon Sep 17 00:00:00 2001
From: Billy Tsai <billy_tsai(a)aspeedtech.com>
Date: Mon, 21 Feb 2022 09:27:05 +0800
Subject: [PATCH] iio: adc: aspeed: Add divider flag to fix incorrect voltage
reading.
The formula for the ADC sampling period in ast2400/ast2500 is:
ADC clock period = PCLK * 2 * (ADC0C[31:17] + 1) * (ADC0C[9:0])
When ADC0C[9:0] is set to 0 the sampling voltage will be lower than
expected, because the hardware may not have enough time to
charge/discharge to a stable voltage. This patch uses the
CLK_DIVIDER_ONE_BASED flag, which makes the divider use the raw value
read from the register and treats a value of zero as invalid, conforming
to the corrected formula.
Fixes: 573803234e72 ("iio: Aspeed ADC")
Reported-by: Konstantin Klubnichkin <kitsok(a)yandex-team.ru>
Signed-off-by: Billy Tsai <billy_tsai(a)aspeedtech.com>
Reviewed-by: Joel Stanley <joel(a)jms.id.au>
Link: https://lore.kernel.org/r/20220221012705.22008-1-billy_tsai@aspeedtech.com
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/adc/aspeed_adc.c b/drivers/iio/adc/aspeed_adc.c
index e939b84cbb56..0793d2474cdc 100644
--- a/drivers/iio/adc/aspeed_adc.c
+++ b/drivers/iio/adc/aspeed_adc.c
@@ -539,7 +539,9 @@ static int aspeed_adc_probe(struct platform_device *pdev)
data->clk_scaler = devm_clk_hw_register_divider(
&pdev->dev, clk_name, clk_parent_name, scaler_flags,
data->base + ASPEED_REG_CLOCK_CONTROL, 0,
- data->model_data->scaler_bit_width, 0, &data->clk_lock);
+ data->model_data->scaler_bit_width,
+ data->model_data->need_prescaler ? CLK_DIVIDER_ONE_BASED : 0,
+ &data->clk_lock);
if (IS_ERR(data->clk_scaler))
return PTR_ERR(data->clk_scaler);
The component match callback function needs to check whether the
expected data is passed to it. Without this check, a NULL pointer
dereference can occur when another driver registers a component before
the i915 driver has its component master fully bound.
Fixes: 7b882fe3e3e8b ("ALSA: hda - handle multiple i915 device instances")
Signed-off-by: Heikki Krogerus <heikki.krogerus(a)linux.intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
Signed-off-by: Won Chung <wonchung(a)google.com>
---
- Add "Fixes" tag
- Send to stable(a)vger.kernel.org
sound/hda/hdac_i915.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c
index efe810af28c5..958b0975fa40 100644
--- a/sound/hda/hdac_i915.c
+++ b/sound/hda/hdac_i915.c
@@ -102,13 +102,13 @@ static int i915_component_master_match(struct device *dev, int subcomponent,
struct pci_dev *hdac_pci, *i915_pci;
struct hdac_bus *bus = data;
- if (!dev_is_pci(dev))
+ if (!dev_is_pci(dev) || !bus)
return 0;
hdac_pci = to_pci_dev(bus->dev);
i915_pci = to_pci_dev(dev);
- if (!strcmp(dev->driver->name, "i915") &&
+ if (dev->driver && !strcmp(dev->driver->name, "i915") &&
subcomponent == I915_COMPONENT_AUDIO &&
connectivity_check(i915_pci, hdac_pci))
return 1;
--
2.35.1.1021.g381101b075-goog
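For reference, the match callback reads as follows with both checks in
place; this is a sketch assembled from the hunk above, with
connectivity_check() and the surrounding types coming from
sound/hda/hdac_i915.c:

static int i915_component_master_match(struct device *dev, int subcomponent,
				       void *data)
{
	struct pci_dev *hdac_pci, *i915_pci;
	struct hdac_bus *bus = data;

	/* bail out if dev is not a PCI device or no bus data was passed in */
	if (!dev_is_pci(dev) || !bus)
		return 0;

	hdac_pci = to_pci_dev(bus->dev);
	i915_pci = to_pci_dev(dev);

	/* dev->driver may still be NULL while the device is not bound */
	if (dev->driver && !strcmp(dev->driver->name, "i915") &&
	    subcomponent == I915_COMPONENT_AUDIO &&
	    connectivity_check(i915_pci, hdac_pci))
		return 1;

	return 0;
}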
The patch titled
Subject: mm/mempolicy: fix mpol_new leak in shared_policy_replace
has been added to the -mm tree. Its filename is
mm-mempolicy-fix-mpol_new-leak-in-shared_policy_replace.patch
This patch should soon appear at
https://ozlabs.org/~akpm/mmots/broken-out/mm-mempolicy-fix-mpol_new-leak-in…
and later at
https://ozlabs.org/~akpm/mmotm/broken-out/mm-mempolicy-fix-mpol_new-leak-in…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: Miaohe Lin <linmiaohe(a)huawei.com>
Subject: mm/mempolicy: fix mpol_new leak in shared_policy_replace
If mpol_new is allocated but not used in the restart loop, it is freed
via mpol_put() before returning to the caller. But its refcnt is not
initialized yet, so mpol_put() cannot do the right thing and might leak
the unused mpol_new. This can happen if the mempolicy of the shared
shmem file is updated while sp->lock is dropped during the memory
allocation.
This issue can be triggered easily with the code snippet below if many
processes do the following work at the same time:
shmid = shmget((key_t)5566, 1024 * PAGE_SIZE, 0666|IPC_CREAT);
shm = shmat(shmid, 0, 0);
loop many times {
mbind(shm, 1024 * PAGE_SIZE, MPOL_LOCAL, mask, maxnode, 0);
mbind(shm + 128 * PAGE_SIZE, 128 * PAGE_SIZE, MPOL_DEFAULT, mask,
maxnode, 0);
}
Link: https://lkml.kernel.org/r/20220329111416.27954-1-linmiaohe@huawei.com
Fixes: 42288fe366c4 ("mm: mempolicy: Convert shared_policy mutex to spinlock")
Signed-off-by: Miaohe Lin <linmiaohe(a)huawei.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: KOSAKI Motohiro <kosaki.motohiro(a)jp.fujitsu.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: <stable(a)vger.kernel.org> [3.8
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mempolicy.c | 1 +
1 file changed, 1 insertion(+)
--- a/mm/mempolicy.c~mm-mempolicy-fix-mpol_new-leak-in-shared_policy_replace
+++ a/mm/mempolicy.c
@@ -2733,6 +2733,7 @@ alloc_new:
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
+ atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
_
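A self-contained version of the reproducer sketched in the commit
message follows; it is a best-effort sketch, assuming 4 KiB pages and
libnuma headers (build with cc repro.c -lnuma). Note that mbind(2)
expects an empty nodemask for MPOL_LOCAL and MPOL_DEFAULT, so NULL/0 is
passed instead of the mask/maxnode placeholders above. Each instance
loops forever, so run many in parallel to hit the race.

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <numaif.h>

#define PAGE_SIZE 4096UL	/* assumption: 4 KiB pages */

int main(void)
{
	int shmid = shmget((key_t)5566, 1024 * PAGE_SIZE, 0666 | IPC_CREAT);
	char *shm = shmat(shmid, 0, 0);

	if (shmid < 0 || shm == (void *)-1) {
		perror("shmget/shmat");
		return 1;
	}

	for (;;) {
		/* alternate policies on overlapping ranges of the segment */
		mbind(shm, 1024 * PAGE_SIZE, MPOL_LOCAL, NULL, 0, 0);
		mbind(shm + 128 * PAGE_SIZE, 128 * PAGE_SIZE, MPOL_DEFAULT,
		      NULL, 0, 0);
	}
	return 0;
}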
Patches currently in -mm which might be from linmiaohe(a)huawei.com are
mm-mempolicy-fix-mpol_new-leak-in-shared_policy_replace.patch
mm-shmem-make-shmem_init-return-void.patch
mm-memcg-remove-unneeded-nr_scanned.patch
mm-mremap-use-helper-mlock_future_check.patch
mm-z3fold-declare-z3fold_mount-with-__init.patch
mm-z3fold-remove-obsolete-comment-in-z3fold_alloc.patch
mm-z3fold-minor-clean-up-for-z3fold_free.patch
mm-z3fold-remove-unneeded-page_mapcount_reset-and-clearpageprivate.patch
mm-z3fold-remove-confusing-local-variable-l-reassignment.patch
mm-z3fold-move-decrement-of-pool-pages_nr-into-__release_z3fold_page.patch
mm-z3fold-remove-redundant-list_del_init-of-zhdr-buddy-in-z3fold_free.patch
mm-z3fold-remove-unneeded-page_headless-check-in-free_handle.patch
mm-compaction-use-helper-isolation_suitable.patch
mm-migration-remove-unneeded-local-variable-mapping_locked.patch
mm-migration-remove-unneeded-out-label.patch
mm-migration-remove-unneeded-local-variable-page_lru.patch
mm-migration-fix-the-confusing-pagetranshuge-check.patch
mm-migration-use-helper-function-vma_lookup-in-add_page_for_migration.patch
mm-migration-use-helper-macro-min-in-do_pages_stat.patch
mm-migration-avoid-unneeded-nodemask_t-initialization.patch
mm-migration-remove-some-duplicated-codes-in-migrate_pages.patch
mm-migration-fix-potential-page-refcounts-leak-in-migrate_pages.patch
mm-migration-fix-potential-invalid-node-access-for-reclaim-based-migration.patch
mm-migration-fix-possible-do_pages_stat_array-racing-with-memory-offline.patch
From: David Stevens <stevensd(a)chromium.org>
Calculate the appropriate mask for non-size-aligned page selective
invalidation. Since psi uses the mask value to mask out the lower order
bits of the target address, properly flushing the iotlb requires using a
mask value such that [pfn, pfn+pages) all lie within the flushed
size-aligned region. This is not normally an issue because iova.c
always allocates iovas that are aligned to their size. However, iovas
which come from other sources (e.g. userspace via VFIO) may not be
aligned.
To properly flush the IOTLB, both the start and end pfns need to be
equal after applying the mask. That means that the most efficient mask
to use is the index of the lowest bit that is equal where all higher
bits are also equal. For example, if pfn=0x17f and pages=3, then
end_pfn=0x181, so the smallest mask we can use is 8. Any differences
above the highest bit of pages are due to carrying, so by xnor'ing pfn
and end_pfn and then masking out the lower order bits based on pages, we
get 0xffffff00, where the first set bit is the mask we want to use.
Fixes: 6fe1010d6d9c ("vfio/type1: DMA unmap chunking")
Cc: stable(a)vger.kernel.org
Signed-off-by: David Stevens <stevensd(a)chromium.org>
Reviewed-by: Kevin Tian <kevin.tian(a)intel.com>
---
The seeds of the bug were introduced by f76aec76ec7f6, which
simultaneously added the alignment requirement to the iommu driver and
made the iova allocator return aligned iovas. However, I don't think
there was any way to trigger the bug at that time. The tagged VFIO
change is one that actually introduced a code path that could trigger
the bug. There may also be other ways to trigger the bug that I am not
aware of.
v1 -> v2:
- Calculate an appropriate mask for non-size-aligned iovas instead
of falling back to domain selective flush.
v2 -> v3:
- Add more detail to commit message.
drivers/iommu/intel/iommu.c | 27 ++++++++++++++++++++++++---
1 file changed, 24 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 5b196cfe9ed2..ab2273300346 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1717,7 +1717,8 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
unsigned long pfn, unsigned int pages,
int ih, int map)
{
- unsigned int mask = ilog2(__roundup_pow_of_two(pages));
+ unsigned int aligned_pages = __roundup_pow_of_two(pages);
+ unsigned int mask = ilog2(aligned_pages);
uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
u16 did = domain->iommu_did[iommu->seq_id];
@@ -1729,10 +1730,30 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
if (domain_use_first_level(domain)) {
domain_flush_piotlb(iommu, domain, addr, pages, ih);
} else {
+ unsigned long bitmask = aligned_pages - 1;
+
+ /*
+ * PSI masks the low order bits of the base address. If the
+ * address isn't aligned to the mask, then compute a mask value
+ * needed to ensure the target range is flushed.
+ */
+ if (unlikely(bitmask & pfn)) {
+ unsigned long end_pfn = pfn + pages - 1, shared_bits;
+
+ /*
+ * Since end_pfn <= pfn + bitmask, the only way bits
+ * higher than bitmask can differ in pfn and end_pfn is
+ * by carrying. This means after masking out bitmask,
+ * high bits starting with the first set bit in
+ * shared_bits are all equal in both pfn and end_pfn.
+ */
+ shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
+ mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
+ }
+
/*
* Fallback to domain selective flush if no PSI support or
- * the size is too big. PSI requires page size to be 2 ^ x,
- * and the base address is naturally aligned to the size.
+ * the size is too big.
*/
if (!cap_pgsel_inv(iommu->cap) ||
mask > cap_max_amask_val(iommu->cap))
--
2.35.1.1094.g7c7d902a7c-goog
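The mask derivation in the commit message can be checked in isolation.
Below is a user-space sketch of the same computation, with the kernel's
__roundup_pow_of_two()/ilog2()/__ffs() replaced by compiler builtins; it
reproduces the pfn=0x17f, pages=3 example:

#include <stdio.h>

/* round up to the next power of two (stand-in for __roundup_pow_of_two) */
static unsigned long roundup_pow2(unsigned long n)
{
	return n == 1 ? 1 : 1UL << (64 - __builtin_clzl(n - 1));
}

static unsigned int psi_mask(unsigned long pfn, unsigned long pages)
{
	unsigned long aligned_pages = roundup_pow2(pages);
	unsigned long bitmask = aligned_pages - 1;
	unsigned int mask = __builtin_ctzl(aligned_pages); /* ilog2 of a power of two */

	if (bitmask & pfn) {
		unsigned long end_pfn = pfn + pages - 1;
		unsigned long shared_bits = ~(pfn ^ end_pfn) & ~bitmask;

		/* first set bit of shared_bits is the smallest usable mask */
		mask = shared_bits ? __builtin_ctzl(shared_bits) : 64;
	}
	return mask;
}

int main(void)
{
	printf("pfn=0x17f pages=3 -> mask=%u\n", psi_mask(0x17f, 3));	/* 8 */
	printf("pfn=0x180 pages=2 -> mask=%u\n", psi_mask(0x180, 2));	/* 1 */
	return 0;
}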