With a recent change to our send path for FSF commands we introduced a
possible use-after-free of request-objects, that might further lead to zfcp
crafting bad requests, which the FCP channel correctly complains about with
an error (FSF_PROT_SEQ_NUMB_ERROR). This error is then handled by an
adapter-wide recovery.
The following sequence illustrates the possible use-after-free:
Send Path:
int zfcp_fsf_open_port(struct zfcp_erp_action *erp_action)
{
struct zfcp_fsf_req *req;
...
spin_lock_irq(&qdio->req_q_lock);
// ^^^^^^^^^^^^^^^^
// protects QDIO queue during sending
...
req = zfcp_fsf_req_create(qdio,
FSF_QTCB_OPEN_PORT_WITH_DID,
SBAL_SFLAGS0_TYPE_READ,
qdio->adapter->pool.erp_req);
// ^^^^^^^^^^^^^^^^^^^
// allocation of the request-object
...
retval = zfcp_fsf_req_send(req);
...
spin_unlock_irq(&qdio->req_q_lock);
return retval;
}
static int zfcp_fsf_req_send(struct zfcp_fsf_req *req)
{
struct zfcp_adapter *adapter = req->adapter;
struct zfcp_qdio *qdio = adapter->qdio;
...
zfcp_reqlist_add(adapter->req_list, req);
// ^^^^^^^^^^^^^^^^
// add request to our driver-internal hash-table for tracking
// (protected by separate lock req_list->lock)
...
if (zfcp_qdio_send(qdio, &req->qdio_req)) {
// ^^^^^^^^^^^^^^
// hand-off the request to FCP channel;
// the request can complete at any point now
...
}
/* Don't increase for unsolicited status */
if (!zfcp_fsf_req_is_status_read_buffer(req))
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// possible use-after-free
adapter->fsf_req_seq_no++;
// ^^^^^^^^^^^^^^^^
// because of the use-after-free we might
// miss this accounting, and as follow-up
// this results in the FCP channel error
// FSF_PROT_SEQ_NUMB_ERROR
adapter->req_no++;
return 0;
}
static inline bool
zfcp_fsf_req_is_status_read_buffer(struct zfcp_fsf_req *req)
{
return req->qtcb == NULL;
// ^^^^^^^^^
// possible use-after-free
}
Response Path:
void zfcp_fsf_reqid_check(struct zfcp_qdio *qdio, int sbal_idx)
{
...
struct zfcp_fsf_req *fsf_req;
...
for (idx = 0; idx < QDIO_MAX_ELEMENTS_PER_BUFFER; idx++) {
...
fsf_req = zfcp_reqlist_find_rm(adapter->req_list,
req_id);
// ^^^^^^^^^^^^^^^^^^^^
// remove request from our driver-internal
// hash-table (lock req_list->lock)
...
zfcp_fsf_req_complete(fsf_req);
}
}
static void zfcp_fsf_req_complete(struct zfcp_fsf_req *req)
{
...
if (likely(req->status & ZFCP_STATUS_FSFREQ_CLEANUP))
zfcp_fsf_req_free(req);
// ^^^^^^^^^^^^^^^^^
// free memory for request-object
else
complete(&req->completion);
// ^^^^^^^^
// completion notification for code-paths that wait
// synchronously for the completion of the request; in
// those the memory is freed separately
}
The result of the use-after-free only affects the send path, and can not
lead to any data corruption. In case we miss the sequence-number
accounting, because the memory was already re-purposed, the next FSF
command will fail with said FCP channel error, and we will recover the
whole adapter. This causes no additional errors, but it slows down traffic.
There is a slight chance of the same thing happening again recursively after
the adapter recovery, but so far this has not been seen.
This was seen under z/VM, where the send path might run on a virtual CPU
that gets scheduled away by z/VM, while the return path might still run,
and so create the necessary timing. Running with KASAN can also slow down
the kernel sufficiently to run into this use-after-free, and then see the
report by KASAN.
To fix this, simply pull the test for the sequence-number accounting in
front of the hand-off to the FCP channel (this information doesn't change
during hand-off), but leave the sequence-number accounting itself where it
is.
To make future regressions of the same kind less likely, add comments to
all closely related code-paths.
Signed-off-by: Benjamin Block <bblock(a)linux.ibm.com>
Fixes: f9eca0227600 ("scsi: zfcp: drop duplicate fsf_command from zfcp_fsf_req which is also in QTCB header")
Cc: <stable(a)vger.kernel.org> #5.0+
Reviewed-by: Steffen Maier <maier(a)linux.ibm.com>
Reviewed-by: Jens Remus <jremus(a)linux.ibm.com>
---
drivers/s390/scsi/zfcp_fsf.c | 45 ++++++++++++++++++++++++++++++++----
1 file changed, 40 insertions(+), 5 deletions(-)
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index d94496ee6883..c5b2615b49ef 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/blktrace_api.h>
+#include <linux/types.h>
#include <linux/slab.h>
#include <scsi/fc/fc_els.h>
#include "zfcp_ext.h"
@@ -741,6 +742,7 @@ static struct zfcp_fsf_req *zfcp_fsf_req_create(struct zfcp_qdio *qdio,
static int zfcp_fsf_req_send(struct zfcp_fsf_req *req)
{
+ const bool is_srb = zfcp_fsf_req_is_status_read_buffer(req);
struct zfcp_adapter *adapter = req->adapter;
struct zfcp_qdio *qdio = adapter->qdio;
int req_id = req->req_id;
@@ -757,8 +759,20 @@ static int zfcp_fsf_req_send(struct zfcp_fsf_req *req)
return -EIO;
}
+ /*
+ * NOTE: DO NOT TOUCH ASYNC req PAST THIS POINT.
+ * ONLY TOUCH SYNC req AGAIN ON req->completion.
+ *
+ * The request might complete and be freed concurrently at any point
+ * now. This is not protected by the QDIO-lock (req_q_lock). So any
+ * uncontrolled access after this might result in an use-after-free bug.
+ * Only if the request doesn't have ZFCP_STATUS_FSFREQ_CLEANUP set, and
+ * when it is completed via req->completion, is it safe to use req
+ * again.
+ */
+
/* Don't increase for unsolicited status */
- if (!zfcp_fsf_req_is_status_read_buffer(req))
+ if (!is_srb)
adapter->fsf_req_seq_no++;
adapter->req_no++;
@@ -805,6 +819,7 @@ int zfcp_fsf_status_read(struct zfcp_qdio *qdio)
retval = zfcp_fsf_req_send(req);
if (retval)
goto failed_req_send;
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
goto out;
@@ -914,8 +929,10 @@ struct zfcp_fsf_req *zfcp_fsf_abort_fcp_cmnd(struct scsi_cmnd *scmnd)
req->qtcb->bottom.support.req_handle = (u64) old_req_id;
zfcp_fsf_start_timer(req, ZFCP_FSF_SCSI_ER_TIMEOUT);
- if (!zfcp_fsf_req_send(req))
+ if (!zfcp_fsf_req_send(req)) {
+ /* NOTE: DO NOT TOUCH req, UNTIL IT COMPLETES! */
goto out;
+ }
out_error_free:
zfcp_fsf_req_free(req);
@@ -1098,6 +1115,7 @@ int zfcp_fsf_send_ct(struct zfcp_fc_wka_port *wka_port,
ret = zfcp_fsf_req_send(req);
if (ret)
goto failed_send;
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
goto out;
@@ -1198,6 +1216,7 @@ int zfcp_fsf_send_els(struct zfcp_adapter *adapter, u32 d_id,
ret = zfcp_fsf_req_send(req);
if (ret)
goto failed_send;
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
goto out;
@@ -1243,6 +1262,7 @@ int zfcp_fsf_exchange_config_data(struct zfcp_erp_action *erp_action)
zfcp_fsf_req_free(req);
erp_action->fsf_req_id = 0;
}
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
return retval;
@@ -1279,8 +1299,10 @@ int zfcp_fsf_exchange_config_data_sync(struct zfcp_qdio *qdio,
zfcp_fsf_start_timer(req, ZFCP_FSF_REQUEST_TIMEOUT);
retval = zfcp_fsf_req_send(req);
spin_unlock_irq(&qdio->req_q_lock);
- if (!retval)
+ if (!retval) {
+ /* NOTE: ONLY TOUCH SYNC req AGAIN ON req->completion. */
wait_for_completion(&req->completion);
+ }
zfcp_fsf_req_free(req);
return retval;
@@ -1330,6 +1352,7 @@ int zfcp_fsf_exchange_port_data(struct zfcp_erp_action *erp_action)
zfcp_fsf_req_free(req);
erp_action->fsf_req_id = 0;
}
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
return retval;
@@ -1372,8 +1395,10 @@ int zfcp_fsf_exchange_port_data_sync(struct zfcp_qdio *qdio,
retval = zfcp_fsf_req_send(req);
spin_unlock_irq(&qdio->req_q_lock);
- if (!retval)
+ if (!retval) {
+ /* NOTE: ONLY TOUCH SYNC req AGAIN ON req->completion. */
wait_for_completion(&req->completion);
+ }
zfcp_fsf_req_free(req);
@@ -1493,6 +1518,7 @@ int zfcp_fsf_open_port(struct zfcp_erp_action *erp_action)
erp_action->fsf_req_id = 0;
put_device(&port->dev);
}
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
return retval;
@@ -1557,6 +1583,7 @@ int zfcp_fsf_close_port(struct zfcp_erp_action *erp_action)
zfcp_fsf_req_free(req);
erp_action->fsf_req_id = 0;
}
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
return retval;
@@ -1626,6 +1653,7 @@ int zfcp_fsf_open_wka_port(struct zfcp_fc_wka_port *wka_port)
retval = zfcp_fsf_req_send(req);
if (retval)
zfcp_fsf_req_free(req);
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
if (!retval)
@@ -1681,6 +1709,7 @@ int zfcp_fsf_close_wka_port(struct zfcp_fc_wka_port *wka_port)
retval = zfcp_fsf_req_send(req);
if (retval)
zfcp_fsf_req_free(req);
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
if (!retval)
@@ -1776,6 +1805,7 @@ int zfcp_fsf_close_physical_port(struct zfcp_erp_action *erp_action)
zfcp_fsf_req_free(req);
erp_action->fsf_req_id = 0;
}
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
return retval;
@@ -1899,6 +1929,7 @@ int zfcp_fsf_open_lun(struct zfcp_erp_action *erp_action)
zfcp_fsf_req_free(req);
erp_action->fsf_req_id = 0;
}
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
return retval;
@@ -1987,6 +2018,7 @@ int zfcp_fsf_close_lun(struct zfcp_erp_action *erp_action)
zfcp_fsf_req_free(req);
erp_action->fsf_req_id = 0;
}
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
out:
spin_unlock_irq(&qdio->req_q_lock);
return retval;
@@ -2299,6 +2331,7 @@ int zfcp_fsf_fcp_cmnd(struct scsi_cmnd *scsi_cmnd)
retval = zfcp_fsf_req_send(req);
if (unlikely(retval))
goto failed_scsi_cmnd;
+ /* NOTE: DO NOT TOUCH req PAST THIS POINT! */
goto out;
@@ -2373,8 +2406,10 @@ struct zfcp_fsf_req *zfcp_fsf_fcp_task_mgmt(struct scsi_device *sdev,
zfcp_fc_fcp_tm(fcp_cmnd, sdev, tm_flags);
zfcp_fsf_start_timer(req, ZFCP_FSF_SCSI_ER_TIMEOUT);
- if (!zfcp_fsf_req_send(req))
+ if (!zfcp_fsf_req_send(req)) {
+ /* NOTE: DO NOT TOUCH req, UNTIL IT COMPLETES! */
goto out;
+ }
zfcp_fsf_req_free(req);
req = NULL;
--
2.17.1
From: Wanpeng Li <wanpengli(a)tencent.com>
Thomas reported that:
| Background:
|
| In preparation of supporting IPI shorthands I changed the CPU offline
| code to software disable the local APIC instead of just masking it.
| That's done by clearing the APIC_SPIV_APIC_ENABLED bit in the APIC_SPIV
| register.
|
| Failure:
|
| When the CPU comes back online the startup code triggers occasionally
| the warning in apic_pending_intr_clear(). That complains that the IRRs
| are not empty.
|
| The offending vector is the local APIC timer vector whose IRR bit is set
| and stays set.
|
| It took me quite some time to reproduce the issue locally, but now I can
| see what happens.
|
| It requires apicv_enabled=0, i.e. full apic emulation. With apicv_enabled=1
| (and hardware support) it behaves correctly.
|
| Here is the series of events:
|
| Guest CPU
|
| goes down
|
| native_cpu_disable()
|
| apic_soft_disable();
|
| play_dead()
|
| ....
|
| startup()
|
| if (apic_enabled())
| apic_pending_intr_clear() <- Not taken
|
| enable APIC
|
| apic_pending_intr_clear() <- Triggers warning because IRR is stale
|
| When this happens, the deadline timer or the regular APIC timer (it
| happens with both) has fired shortly before the APIC is disabled, but the
| interrupt was not serviced because the guest CPU was in an interrupt
| disabled region at that point.
|
| The state of the timer vector ISR/IRR bits:
|
| ISR IRR
| before apic_soft_disable() 0 1
| after apic_soft_disable() 0 1
|
| On startup 0 1
|
| Now one would assume that the IRR is cleared after the INIT reset, but this
| happens only on CPU0.
|
| Why?
|
| Because our CPU0 hotplug is just for testing to make sure nothing breaks
| and goes through an NMI wakeup vehicle because INIT would send it through
| the bootstrap code which is not really working if that CPU was not
| physically unplugged.
|
| Now looking at a real world APIC the situation in that case is:
|
| ISR IRR
| before apic_soft_disable() 0 1
| after apic_soft_disable() 0 1
|
| On startup 0 0
|
| Why?
|
| Once the dying CPU reenables interrupts the pending interrupt gets
| delivered as a spurious interrupt and then the state is clear.
|
| While that CPU0 hotplug test case is surely an esoteric issue, the APIC
| emulation is still wrong. Even if the play_dead() code would not enable
| interrupts then the pending IRR bit would turn into an ISR .. interrupt
| when the APIC is reenabled on startup.
Per SDM 10.4.7.2, "Local APIC State After It Has Been Software Disabled":
* Pending interrupts in the IRR and ISR registers are held and require
masking or handling by the CPU.
In Thomas's testing, a hardware CPU does not respect the LAPIC soft disable
when the IRR has already been set or an APICv posted interrupt is in flight,
so we can skip the soft-disable check when clearing IRR and setting ISR,
while continuing to respect the soft disable when attempting to set IRR.
Reported-by: Rong Chen <rong.a.chen(a)intel.com>
Reported-by: Feng Tang <feng.tang(a)intel.com>
Reported-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Radim Krčmář <rkrcmar(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Rong Chen <rong.a.chen(a)intel.com>
Cc: Feng Tang <feng.tang(a)intel.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Wanpeng Li <wanpengli(a)tencent.com>
---
arch/x86/kvm/lapic.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 05d8934..f857a12 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2376,7 +2376,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (!apic_enabled(apic))
+ if (!kvm_apic_hw_enabled(apic))
return -1;
__apic_update_ppr(apic, &ppr);
--
2.7.4
Allow userspace to set a custom value for the VMFUNC controls MSR, as long
as the capabilities it advertises do not exceed those of the host.
Fixes: 27c42a1bb ("KVM: nVMX: Enable VMFUNC for the L1 hypervisor", 2017-08-03)
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kvm/vmx/nested.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index c4e29ef0b21e..163d226efa96 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1234,6 +1234,11 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
case MSR_IA32_VMX_VMCS_ENUM:
vmx->nested.msrs.vmcs_enum = data;
return 0;
+ case MSR_IA32_VMX_VMFUNC:
+ if (data & ~vmx->nested.msrs.vmfunc_controls)
+ return -EINVAL;
+ vmx->nested.msrs.vmfunc_controls = data;
+ return 0;
default:
/*
* The rest of the VMX capability MSRs do not support restore.
--
1.8.3.1
This allows userspace to know which MSRs are supported by the hypervisor.
Unfortunately userspace must resort to tricks for everything except
MSR_IA32_VMX_VMFUNC (which was just added in the previous patch).
One possibility is to use the feature control MSR, which is tied to nested
VMX as well and is present on all KVM versions that support feature MSRs.
Fixes: 1389309c811 ("KVM: nVMX: expose VMX capabilities for nested hypervisors to userspace", 2018-02-26)
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kvm/svm.c | 1 +
arch/x86/kvm/vmx/vmx.c | 2 ++
arch/x86/kvm/x86.c | 20 ++++++++++++++++++++
3 files changed, 23 insertions(+)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bbc31f7213ed..5db50c19d1c7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -5885,6 +5885,7 @@ static bool svm_has_emulated_msr(int index)
{
switch (index) {
case MSR_IA32_MCG_EXT_CTL:
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return false;
default:
break;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a35459ce7e29..c43635942693 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6223,6 +6223,8 @@ static bool vmx_has_emulated_msr(int index)
* real mode.
*/
return enable_unrestricted_guest || emulate_invalid_guest_state;
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
/* This is AMD only. */
return false;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8996a3131116..a02d4c244422 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1177,6 +1177,26 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
MSR_AMD64_VIRT_SPEC_CTRL,
MSR_IA32_POWER_CTL,
+ /*
+ * The following list leaves out MSRs whose values are determined
+ * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
+ * We always support the "true" VMX control MSRs, even if the host
+ * processor does not, so I am putting these registers here rather
+ * than in msrs_to_save.
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
MSR_K7_HWCR,
MSR_KVM_POLL_CONTROL,
};
--
1.8.3.1
On S2MPS11 device, the buck7 and buck8 regulator voltages start at 750
mV, not 600 mV. Using wrong minimal value caused shifting of these
regulator values by 150 mV (e.g. buck7 usually configured to 1.35 V was
reported as 1.2 V).
On most of the boards these regulators are left in default state so this
was only affecting reported voltage. However if any driver wanted to
change them, then effectively it would set voltage 150 mV higher than
intended.
Cc: <stable(a)vger.kernel.org>
Fixes: cb74685ecb39 ("regulator: s2mps11: Add samsung s2mps11 regulator driver")
Signed-off-by: Krzysztof Kozlowski <krzk(a)kernel.org>
---
drivers/regulator/s2mps11.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c
index a215cfe20555..8f3c8730dc0d 100644
--- a/drivers/regulator/s2mps11.c
+++ b/drivers/regulator/s2mps11.c
@@ -479,8 +479,8 @@ static const struct regulator_desc s2mps11_regulators[] = {
regulator_desc_s2mps11_buck1_4(4),
regulator_desc_s2mps11_buck5,
regulator_desc_s2mps11_buck67810(6, MIN_600_MV, STEP_6_25_MV),
- regulator_desc_s2mps11_buck67810(7, MIN_600_MV, STEP_12_5_MV),
- regulator_desc_s2mps11_buck67810(8, MIN_600_MV, STEP_12_5_MV),
+ regulator_desc_s2mps11_buck67810(7, MIN_750_MV, STEP_12_5_MV),
+ regulator_desc_s2mps11_buck67810(8, MIN_750_MV, STEP_12_5_MV),
regulator_desc_s2mps11_buck9,
regulator_desc_s2mps11_buck67810(10, MIN_750_MV, STEP_12_5_MV),
};
--
2.17.1