Hi Greg,
Several peoples have reported the hung-task detector complains about an uninterruptible sleep in ccp driver with recent BIOS updates on Ryzen systems. During debug we found that recent BIOS updates have wrongly advertised that it supports SEV feature when it does not. We have made driver a bit more robust against this firmware bug. The patch is accepted in Linus tree for this. Due to some offset changes (and code restructure) I was not able to use upstream patch as-is. Doing a separate patch for 4.18, 4.17 and 4.16. Please let me know if you have any questions.
~ Brijesh
commit 3702a0585e64d70d5bf73bf3e943b8d6005b72c1 upstream.
Currently, the CCP driver assumes that the SEV command issued to the PSP will always return (i.e. it will never hang). But recently, firmware bugs have shown that a command can hang. Since of the SEV commands are used in probe routines, this can cause boot hangs and/or loss of virtualization capabilities.
To protect against firmware bugs, add a timeout in the SEV command execution flow. If a command does not complete within the specified timeout then return -ETIMEOUT and stop the driver from executing any further commands since the state of the SEV firmware is unknown.
Cc: Tom Lendacky thomas.lendacky@amd.com Cc: Gary Hook Gary.Hook@amd.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: linux-kernel@vger.kernel.orga Cc: stable@vger.kernel.org # 4.16.x: f426d2b20f1c: crypto: ccp - Fix command completion detection race [Brijesh: Backported to 4.16..4.19 - offset change in few hunks] Signed-off-by: Brijesh Singh brijesh.singh@amd.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au --- drivers/crypto/ccp/psp-dev.c | 46 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-)
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 906daba..883f100 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -32,6 +32,17 @@ static DEFINE_MUTEX(sev_cmd_mutex); static struct sev_misc_dev *misc_dev; static struct psp_device *psp_master;
+static int psp_cmd_timeout = 100; +module_param(psp_cmd_timeout, int, 0644); +MODULE_PARM_DESC(psp_cmd_timeout, " default timeout value, in seconds, for PSP commands"); + +static int psp_probe_timeout = 5; +module_param(psp_probe_timeout, int, 0644); +MODULE_PARM_DESC(psp_probe_timeout, " default timeout value, in seconds, during PSP device probe"); + +static bool psp_dead; +static int psp_timeout; + static struct psp_device *psp_alloc_struct(struct sp_device *sp) { struct device *dev = sp->dev; @@ -76,10 +87,19 @@ static irqreturn_t psp_irq_handler(int irq, void *data) return IRQ_HANDLED; }
-static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg) +static int sev_wait_cmd_ioc(struct psp_device *psp, + unsigned int *reg, unsigned int timeout) { - wait_event(psp->sev_int_queue, psp->sev_int_rcvd); + int ret; + + ret = wait_event_timeout(psp->sev_int_queue, + psp->sev_int_rcvd, timeout * HZ); + if (!ret) + return -ETIMEDOUT; + *reg = ioread32(psp->io_regs + PSP_CMDRESP); + + return 0; }
static int sev_cmd_buffer_len(int cmd) @@ -125,12 +145,15 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) if (!psp) return -ENODEV;
+ if (psp_dead) + return -EBUSY; + /* Get the physical address of the command buffer */ phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0; phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
- dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x\n", - cmd, phys_msb, phys_lsb); + dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n", + cmd, phys_msb, phys_lsb, psp_timeout);
print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data, sev_cmd_buffer_len(cmd), false); @@ -146,7 +169,18 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) iowrite32(reg, psp->io_regs + PSP_CMDRESP);
/* wait for command completion */ - sev_wait_cmd_ioc(psp, ®); + ret = sev_wait_cmd_ioc(psp, ®, psp_timeout); + if (ret) { + if (psp_ret) + *psp_ret = 0; + + dev_err(psp->dev, "sev command %#x timed out, disabling PSP \n", cmd); + psp_dead = true; + + return ret; + } + + psp_timeout = psp_cmd_timeout;
if (psp_ret) *psp_ret = reg & PSP_CMDRESP_ERR_MASK; @@ -773,6 +807,8 @@ void psp_pci_init(void)
psp_master = sp->psp_data;
+ psp_timeout = psp_probe_timeout; + /* Initialize the platform */ rc = sev_platform_init(&error); if (rc) {
commit 3702a0585e64d70d5bf73bf3e943b8d6005b72c1 upstream.
Currently, the CCP driver assumes that the SEV command issued to the PSP will always return (i.e. it will never hang). But recently, firmware bugs have shown that a command can hang. Since of the SEV commands are used in probe routines, this can cause boot hangs and/or loss of virtualization capabilities.
To protect against firmware bugs, add a timeout in the SEV command execution flow. If a command does not complete within the specified timeout then return -ETIMEOUT and stop the driver from executing any further commands since the state of the SEV firmware is unknown.
Cc: Tom Lendacky thomas.lendacky@amd.com Cc: Gary Hook Gary.Hook@amd.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: linux-kernel@vger.kernel.orga Cc: stable@vger.kernel.org # 4.17.x [Brijesh: Backported to 4.17..4.19 - offset change in few hunks] Signed-off-by: Brijesh Singh brijesh.singh@amd.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au --- drivers/crypto/ccp/psp-dev.c | 46 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-)
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 2867612..26fd9f2 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -32,6 +32,17 @@ static DEFINE_MUTEX(sev_cmd_mutex); static struct sev_misc_dev *misc_dev; static struct psp_device *psp_master;
+static int psp_cmd_timeout = 100; +module_param(psp_cmd_timeout, int, 0644); +MODULE_PARM_DESC(psp_cmd_timeout, " default timeout value, in seconds, for PSP commands"); + +static int psp_probe_timeout = 5; +module_param(psp_probe_timeout, int, 0644); +MODULE_PARM_DESC(psp_probe_timeout, " default timeout value, in seconds, during PSP device probe"); + +static bool psp_dead; +static int psp_timeout; + static struct psp_device *psp_alloc_struct(struct sp_device *sp) { struct device *dev = sp->dev; @@ -76,10 +87,19 @@ static irqreturn_t psp_irq_handler(int irq, void *data) return IRQ_HANDLED; }
-static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg) +static int sev_wait_cmd_ioc(struct psp_device *psp, + unsigned int *reg, unsigned int timeout) { - wait_event(psp->sev_int_queue, psp->sev_int_rcvd); + int ret; + + ret = wait_event_timeout(psp->sev_int_queue, + psp->sev_int_rcvd, timeout * HZ); + if (!ret) + return -ETIMEDOUT; + *reg = ioread32(psp->io_regs + PSP_CMDRESP); + + return 0; }
static int sev_cmd_buffer_len(int cmd) @@ -125,12 +145,15 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) if (!psp) return -ENODEV;
+ if (psp_dead) + return -EBUSY; + /* Get the physical address of the command buffer */ phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0; phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
- dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x\n", - cmd, phys_msb, phys_lsb); + dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n", + cmd, phys_msb, phys_lsb, psp_timeout);
print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data, sev_cmd_buffer_len(cmd), false); @@ -146,7 +169,18 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) iowrite32(reg, psp->io_regs + PSP_CMDRESP);
/* wait for command completion */ - sev_wait_cmd_ioc(psp, ®); + ret = sev_wait_cmd_ioc(psp, ®, psp_timeout); + if (ret) { + if (psp_ret) + *psp_ret = 0; + + dev_err(psp->dev, "sev command %#x timed out, disabling PSP \n", cmd); + psp_dead = true; + + return ret; + } + + psp_timeout = psp_cmd_timeout;
if (psp_ret) *psp_ret = reg & PSP_CMDRESP_ERR_MASK; @@ -763,6 +797,8 @@ void psp_pci_init(void)
psp_master = sp->psp_data;
+ psp_timeout = psp_probe_timeout; + /* Initialize the platform */ rc = sev_platform_init(&error); if (rc) {
commit 3702a0585e64d70d5bf73bf3e943b8d6005b72c1 upstream.
Currently, the CCP driver assumes that the SEV command issued to the PSP will always return (i.e. it will never hang). But recently, firmware bugs have shown that a command can hang. Since of the SEV commands are used in probe routines, this can cause boot hangs and/or loss of virtualization capabilities.
To protect against firmware bugs, add a timeout in the SEV command execution flow. If a command does not complete within the specified timeout then return -ETIMEOUT and stop the driver from executing any further commands since the state of the SEV firmware is unknown.
Cc: Tom Lendacky thomas.lendacky@amd.com Cc: Gary Hook Gary.Hook@amd.com Cc: Herbert Xu herbert@gondor.apana.org.au Cc: linux-kernel@vger.kernel.orga Cc: stable@vger.kernel.org # 4.18.x [Brijesh: Backported to 4.18..4.19 - offset change in few hunks] Signed-off-by: Brijesh Singh brijesh.singh@amd.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au --- drivers/crypto/ccp/psp-dev.c | 46 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-)
diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 051b8c6..a9c8509 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -38,6 +38,17 @@ static DEFINE_MUTEX(sev_cmd_mutex); static struct sev_misc_dev *misc_dev; static struct psp_device *psp_master;
+static int psp_cmd_timeout = 100; +module_param(psp_cmd_timeout, int, 0644); +MODULE_PARM_DESC(psp_cmd_timeout, " default timeout value, in seconds, for PSP commands"); + +static int psp_probe_timeout = 5; +module_param(psp_probe_timeout, int, 0644); +MODULE_PARM_DESC(psp_probe_timeout, " default timeout value, in seconds, during PSP device probe"); + +static bool psp_dead; +static int psp_timeout; + static struct psp_device *psp_alloc_struct(struct sp_device *sp) { struct device *dev = sp->dev; @@ -82,10 +93,19 @@ static irqreturn_t psp_irq_handler(int irq, void *data) return IRQ_HANDLED; }
-static void sev_wait_cmd_ioc(struct psp_device *psp, unsigned int *reg) +static int sev_wait_cmd_ioc(struct psp_device *psp, + unsigned int *reg, unsigned int timeout) { - wait_event(psp->sev_int_queue, psp->sev_int_rcvd); + int ret; + + ret = wait_event_timeout(psp->sev_int_queue, + psp->sev_int_rcvd, timeout * HZ); + if (!ret) + return -ETIMEDOUT; + *reg = ioread32(psp->io_regs + PSP_CMDRESP); + + return 0; }
static int sev_cmd_buffer_len(int cmd) @@ -133,12 +153,15 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) if (!psp) return -ENODEV;
+ if (psp_dead) + return -EBUSY; + /* Get the physical address of the command buffer */ phys_lsb = data ? lower_32_bits(__psp_pa(data)) : 0; phys_msb = data ? upper_32_bits(__psp_pa(data)) : 0;
- dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x\n", - cmd, phys_msb, phys_lsb); + dev_dbg(psp->dev, "sev command id %#x buffer 0x%08x%08x timeout %us\n", + cmd, phys_msb, phys_lsb, psp_timeout);
print_hex_dump_debug("(in): ", DUMP_PREFIX_OFFSET, 16, 2, data, sev_cmd_buffer_len(cmd), false); @@ -154,7 +177,18 @@ static int __sev_do_cmd_locked(int cmd, void *data, int *psp_ret) iowrite32(reg, psp->io_regs + PSP_CMDRESP);
/* wait for command completion */ - sev_wait_cmd_ioc(psp, ®); + ret = sev_wait_cmd_ioc(psp, ®, psp_timeout); + if (ret) { + if (psp_ret) + *psp_ret = 0; + + dev_err(psp->dev, "sev command %#x timed out, disabling PSP \n", cmd); + psp_dead = true; + + return ret; + } + + psp_timeout = psp_cmd_timeout;
if (psp_ret) *psp_ret = reg & PSP_CMDRESP_ERR_MASK; @@ -886,6 +920,8 @@ void psp_pci_init(void)
psp_master = sp->psp_data;
+ psp_timeout = psp_probe_timeout; + if (sev_get_api_version()) goto err;
On Wed, Sep 19, 2018 at 09:09:09PM +0000, Singh, Brijesh wrote:
Hi Greg,
Several peoples have reported the hung-task detector complains about an uninterruptible sleep in ccp driver with recent BIOS updates on Ryzen systems. During debug we found that recent BIOS updates have wrongly advertised that it supports SEV feature when it does not. We have made driver a bit more robust against this firmware bug. The patch is accepted in Linus tree for this. Due to some offset changes (and code restructure) I was not able to use upstream patch as-is. Doing a separate patch for 4.18, 4.17 and 4.16. Please let me know if you have any questions.
4.17 and 4.16 are long end-of-life, nothing I can do there, sorry. But I have queued up your 4.18.y backport, thanks for that.
greg k-h
linux-stable-mirror@lists.linaro.org