Inside of start_xmit() the check that the connection is up and the queueing of packets for later transmission are not atomic. This leaves a window in which cm_rep_handler can run, set the connection up, dequeue pending packets, and leave the packets subsequently queued by start_xmit() sitting on neigh->queue until they are dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections.

Here's the code in start_xmit where we check to see if the connection is up:

	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	}

The race occurs if cm_rep_handler runs after the above connection check (specifically, once it reaches the point where it acquires priv->lock to dequeue pending skbs) but before the code below in start_xmit where packets are queued:

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
The patch re-checks ipoib_cm_up with priv->lock held to avoid this race condition. Since the connection should be up most of the time (and thus *not* down most of the time), we don't hold the lock for the first check, which avoids a slowdown from unnecessary locking for the majority of the packets transmitted during the connection's life.
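To make the pattern concrete before the diff below, here is a minimal userspace sketch of the same double-check idea (a pthread analogy with made-up names, not driver code): the unlocked fast-path test stands in for the existing ipoib_cm_up() check, and the re-check under the mutex stands in for the new re-check under priv->lock, which closes the window in which the handler could set the flag and drain the queue between the two steps.

	/* Illustrative userspace analogy only; all names are hypothetical.
	 * "conn_up" plays the role of IPOIB_FLAG_OPER_UP, "lock" the role
	 * of priv->lock, and "queue_len" the role of neigh->queue. */
	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static bool conn_up;
	static int queue_len;		/* packets parked for later transmission */

	/* start_xmit() analogue */
	static void xmit_one(void)
	{
		if (conn_up) {			/* lockless fast path */
			printf("sent directly\n");
			return;
		}

		pthread_mutex_lock(&lock);
		if (conn_up) {			/* re-check under the lock: the fix */
			pthread_mutex_unlock(&lock);
			printf("sent directly after re-check\n");
			return;
		}
		queue_len++;			/* cannot race with the drain below */
		pthread_mutex_unlock(&lock);
	}

	/* cm_rep_handler() analogue */
	static void *rep_handler(void *arg)
	{
		pthread_mutex_lock(&lock);
		conn_up = true;
		while (queue_len > 0) {		/* drain everything queued so far */
			queue_len--;
			printf("flushed deferred packet\n");
		}
		pthread_mutex_unlock(&lock);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;

		xmit_one();			/* connection not up yet: deferred */
		pthread_create(&t, NULL, rep_handler, NULL);
		pthread_join(t, NULL);
		xmit_one();			/* connection up: sent directly */
		return 0;
	}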
Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..a950c916 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }

+static bool defer_neigh_skb(struct sk_buff *skb,
+			    struct net_device *dev,
+			    struct ipoib_neigh *neigh,
+			    struct ipoib_pseudo_header *phdr)
+{
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		push_pseudo_header(skb, phdr->hwaddr);
+		__skb_queue_tail(&neigh->queue, skb);
+		return true;
+	}
+
+	return false;
+}
+
+
 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_pseudo_header *phdr;
 	struct ipoib_header *header;
 	unsigned long flags;
+	bool deferred_pkt = true;

 	phdr = (struct ipoib_pseudo_header *) skb->data;
 	skb_pull(skb, sizeof(*phdr));
@@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
 			goto unref;
 		}
+		/*
+		 * Re-check ipoib_cm_up with priv->lock held to avoid
+		 * race condition between start_xmit and skb_dequeue in
+		 * cm_rep_handler. Since odds are the conn should be up
+		 * most of the time, we don't hold the lock for the
+		 * first check above
+		 */
+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else {
+			deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+
+		goto unref;
 	} else if (neigh->ah && neigh->ah->valid) {
 		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
 						IPOIB_QPN(phdr->hwaddr));
@@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		neigh_refresh_path(neigh, phdr->hwaddr, dev);
 	}

-	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-		push_pseudo_header(skb, phdr->hwaddr);
-		spin_lock_irqsave(&priv->lock, flags);
-		__skb_queue_tail(&neigh->queue, skb);
-		spin_unlock_irqrestore(&priv->lock, flags);
-	} else {
+	spin_lock_irqsave(&priv->lock, flags);
+	deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+unref:
+	if (!deferred_pkt) {
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 	}

-unref:
 	ipoib_neigh_put(neigh);

 	return NETDEV_TX_OK;
On Fri, Aug 17, 2018 at 05:31:26PM -0400, Aaron Knister wrote:
Inside of start_xmit() the check that the connection is up and the queueing of packets for later transmission are not atomic. This leaves a window in which cm_rep_handler can run, set the connection up, dequeue pending packets, and leave the packets subsequently queued by start_xmit() sitting on neigh->queue until they are dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections.

Here's the code in start_xmit where we check to see if the connection is up:

	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	}

The race occurs if cm_rep_handler runs after the above connection check (specifically, once it reaches the point where it acquires priv->lock to dequeue pending skbs) but before the code below in start_xmit where packets are queued:

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

The patch re-checks ipoib_cm_up with priv->lock held to avoid this race condition. Since the connection should be up most of the time (and thus *not* down most of the time), we don't hold the lock for the first check, which avoids a slowdown from unnecessary locking for the majority of the packets transmitted during the connection's life.

Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
If you want this fix in stable@, you will need to provide a Fixes: line here and not add stable@ to the CC list.
Thanks,
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
-----Original Message-----
From: Leon Romanovsky [mailto:leon@kernel.org]
Sent: Sunday, August 19, 2018 10:29 AM
To: Aaron Knister <aaron.s.knister@nasa.gov>
Cc: linux-rdma@vger.kernel.org; stable@vger.kernel.org; Weiny, Ira <ira.weiny@intel.com>; Fleck, John <john.fleck@intel.com>
Subject: Re: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
On Fri, Aug 17, 2018 at 05:31:26PM -0400, Aaron Knister wrote:
Inside of start_xmit() the check that the connection is up and the queueing of packets for later transmission are not atomic. This leaves a window in which cm_rep_handler can run, set the connection up, dequeue pending packets, and leave the packets subsequently queued by start_xmit() sitting on neigh->queue until they are dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections.

Here's the code in start_xmit where we check to see if the connection is up:

	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	}

The race occurs if cm_rep_handler runs after the above connection check (specifically, once it reaches the point where it acquires priv->lock to dequeue pending skbs) but before the code below in start_xmit where packets are queued:

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

The patch re-checks ipoib_cm_up with priv->lock held to avoid this race condition. Since the connection should be up most of the time (and thus *not* down most of the time), we don't hold the lock for the first check, which avoids a slowdown from unnecessary locking for the majority of the packets transmitted during the connection's life.

Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
If you want this fix in stable@, you will need to provide a Fixes: line here and not add stable@ to the CC list.
Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup in xmit path")
Cc: stable@vger.kernel.org
Thanks,
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
-----Original Message-----
From: Aaron Knister [mailto:aaron.s.knister@nasa.gov]
Sent: Friday, August 17, 2018 2:31 PM
To: linux-rdma@vger.kernel.org
Cc: stable@vger.kernel.org; Weiny, Ira <ira.weiny@intel.com>; Fleck, John <john.fleck@intel.com>
Subject: [PATCH v2] avoid race condition between start_xmit and cm_rep_handler
Inside of start_xmit() the check that the connection is up and the queueing of packets for later transmission are not atomic. This leaves a window in which cm_rep_handler can run, set the connection up, dequeue pending packets, and leave the packets subsequently queued by start_xmit() sitting on neigh->queue until they are dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections.

Here's the code in start_xmit where we check to see if the connection is up:

	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	}

The race occurs if cm_rep_handler runs after the above connection check (specifically, once it reaches the point where it acquires priv->lock to dequeue pending skbs) but before the code below in start_xmit where packets are queued:

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

The patch re-checks ipoib_cm_up with priv->lock held to avoid this race condition. Since the connection should be up most of the time (and thus *not* down most of the time), we don't hold the lock for the first check, which avoids a slowdown from unnecessary locking for the majority of the packets transmitted during the connection's life.

Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
Tested-by: Ira Weiny <ira.weiny@intel.com>
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..a950c916 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }

+static bool defer_neigh_skb(struct sk_buff *skb,
+			    struct net_device *dev,
+			    struct ipoib_neigh *neigh,
+			    struct ipoib_pseudo_header *phdr)
+{
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		push_pseudo_header(skb, phdr->hwaddr);
+		__skb_queue_tail(&neigh->queue, skb);
+		return true;
+	}
+
+	return false;
+}
+
+
 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_pseudo_header *phdr;
 	struct ipoib_header *header;
 	unsigned long flags;
+	bool deferred_pkt = true;

 	phdr = (struct ipoib_pseudo_header *) skb->data;
 	skb_pull(skb, sizeof(*phdr));
@@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
 			goto unref;
 		}
+		/*
+		 * Re-check ipoib_cm_up with priv->lock held to avoid
+		 * race condition between start_xmit and skb_dequeue in
+		 * cm_rep_handler. Since odds are the conn should be up
+		 * most of the time, we don't hold the lock for the
+		 * first check above
+		 */
+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else {
+			deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+
+		goto unref;
 	} else if (neigh->ah && neigh->ah->valid) {
 		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
 						IPOIB_QPN(phdr->hwaddr));
@@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		neigh_refresh_path(neigh, phdr->hwaddr, dev);
 	}

-	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-		push_pseudo_header(skb, phdr->hwaddr);
-		spin_lock_irqsave(&priv->lock, flags);
-		__skb_queue_tail(&neigh->queue, skb);
-		spin_unlock_irqrestore(&priv->lock, flags);
-	} else {
+	spin_lock_irqsave(&priv->lock, flags);
+	deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+unref:
+	if (!deferred_pkt) {
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 	}

-unref:
 	ipoib_neigh_put(neigh);

 	return NETDEV_TX_OK;
2.12.3
Hi,
Did you check the option to hold the netif_tx_lock_xxx() in ipoib_cm_rep_handler function (over the line set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
Thanks,
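A rough sketch of what holding the tx lock in ipoib_cm_rep_handler() might look like, assuming the handler's existing block that sets IPOIB_FLAG_OPER_UP and drains neigh->queue under priv->lock (with its existing skb and skqueue locals), and assuming its ipoib_cm_tx pointer p exposes the net_device as p->dev; this is illustrative only, not a patch posted in this thread:

	/* Sketch, not a submitted patch: take the device's tx lock so that
	 * start_xmit() cannot interleave between setting the flag and
	 * draining the deferred-skb queue. */
	netif_tx_lock_bh(p->dev);
	spin_lock_irq(&priv->lock);
	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
	if (p->neigh)
		while ((skb = __skb_dequeue(&p->neigh->queue)))
			__skb_queue_tail(&skqueue, skb);
	spin_unlock_irq(&priv->lock);
	netif_tx_unlock_bh(p->dev);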
On Sat, Aug 18, 2018 at 12:31 AM, Aaron Knister aaron.s.knister@nasa.gov wrote:
Inside of start_xmit() the check that the connection is up and the queueing of packets for later transmission are not atomic. This leaves a window in which cm_rep_handler can run, set the connection up, dequeue pending packets, and leave the packets subsequently queued by start_xmit() sitting on neigh->queue until they are dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections.

Here's the code in start_xmit where we check to see if the connection is up:

	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	}

The race occurs if cm_rep_handler runs after the above connection check (specifically, once it reaches the point where it acquires priv->lock to dequeue pending skbs) but before the code below in start_xmit where packets are queued:

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

The patch re-checks ipoib_cm_up with priv->lock held to avoid this race condition. Since the connection should be up most of the time (and thus *not* down most of the time), we don't hold the lock for the first check, which avoids a slowdown from unnecessary locking for the majority of the packets transmitted during the connection's life.

Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..a950c916 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }

+static bool defer_neigh_skb(struct sk_buff *skb,
+			    struct net_device *dev,
+			    struct ipoib_neigh *neigh,
+			    struct ipoib_pseudo_header *phdr)
+{
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		push_pseudo_header(skb, phdr->hwaddr);
+		__skb_queue_tail(&neigh->queue, skb);
+		return true;
+	}
+
+	return false;
+}
+
+
 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_pseudo_header *phdr;
 	struct ipoib_header *header;
 	unsigned long flags;
+	bool deferred_pkt = true;

 	phdr = (struct ipoib_pseudo_header *) skb->data;
 	skb_pull(skb, sizeof(*phdr));
@@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
 			goto unref;
 		}
+		/*
+		 * Re-check ipoib_cm_up with priv->lock held to avoid
+		 * race condition between start_xmit and skb_dequeue in
+		 * cm_rep_handler. Since odds are the conn should be up
+		 * most of the time, we don't hold the lock for the
+		 * first check above
+		 */
+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else {
+			deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+
+		goto unref;
 	} else if (neigh->ah && neigh->ah->valid) {
 		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
 						IPOIB_QPN(phdr->hwaddr));
@@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		neigh_refresh_path(neigh, phdr->hwaddr, dev);
 	}

-	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-		push_pseudo_header(skb, phdr->hwaddr);
-		spin_lock_irqsave(&priv->lock, flags);
-		__skb_queue_tail(&neigh->queue, skb);
-		spin_unlock_irqrestore(&priv->lock, flags);
-	} else {
+	spin_lock_irqsave(&priv->lock, flags);
+	deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+unref:
+	if (!deferred_pkt) {
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 	}

-unref:
 	ipoib_neigh_put(neigh);

 	return NETDEV_TX_OK;
--
2.12.3
On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
Hi,
Did you check the option to hold the netif_tx_lock_xxx() in ipoib_cm_rep_handler function (over the line set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
That does seem better, then the test_bit in the datapath could become non-atomic too :)
Jason
On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
Hi,
Did you check the option to hold the netif_tx_lock_xxx() in ipoib_cm_rep_handler function (over the line set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
That does seem better, then the test_bit in the datapath could become non-atomic too :)
Jason
Thanks for the feedback! I've not tried that but I certainly can.
-Aaron
It seems to fix the issue based on tests with my synthetic reproducer which consists of two carefully placed sleeps-- one in start_xmit and another before cm_rep_handler to force open the race window. In order to test it organically I need to commandeer many hundreds of nodes in our cluster which can be fairly disruptive to our user community. I'll send over v3 as an RFC and it would be great to get feedback about whether or not the patch is acceptable. If it is, I'll work on scheduling an at-scale test after which I'll submit v3. I think the odds are very low the synthetic reproducer would indicate this problem as fixed but the real world test would still experience the problem. I try to be thorough, though.
-Aaron
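As a purely hypothetical illustration of the kind of delay injection described above (the actual reproducer is not shown in this thread), one of the two "sleeps" in start_xmit() could be a short busy-wait (mdelay() from linux/delay.h) placed between the lockless connection check and the queueing of the skb, since that path runs in a context that must not sleep:

	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
		/* test hack only: widen the race window so cm_rep_handler can
		 * set the connection up and drain neigh->queue before the skb
		 * below is queued */
		mdelay(100);
	}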
On 8/20/18 1:40 PM, Aaron Knister wrote:
On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
Hi,
Did you check the option to hold the netif_tx_lock_xxx() in ipoib_cm_rep_handler function (over the line set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
That does seem better, then the test_bit in the datapath could become non-atomic too :)
Jason
Thanks for the feedback! I've not tried that but I certainly can.
-Aaron
I had a small window to test the v3 patch (applied to the 4.4.120 kernel) at scale and it does appear to fix the issue, so that's good. One area of concern, though, is the testing data suggests (and this isn't really scientific at all since there are many variables) the v3 patch appears to be 10% slower than the v2 patch. The "organic" test involves creating a large number of empty files in a GPFS filesystem and exacerbating the race condition with GPFS RPC traffic. My sample sizes aren't large, though (26 iterations for the v2 patch and 8 for the v3 patch due to time constraints). I don't really have a preference for either approach. What do folks think?
-Aaron
On 8/20/18 3:20 PM, Aaron Knister wrote:
It seems to fix the issue based on tests with my synthetic reproducer which consists of two carefully placed sleeps-- one in start_xmit and another before cm_rep_handler to force open the race window. In order to test it organically I need to commandeer many hundreds of nodes in our cluster which can be fairly disruptive to our user community. I'll send over v3 as an RFC and it would be great to get feedback about whether or not the patch is acceptable. If it is, I'll work on scheduling an at-scale test after which I'll submit v3. I think the odds are very low the synthetic reproducer would indicate this problem as fixed but the real world test would still experience the problem. I try to be thorough, though.
-Aaron
On 8/20/18 1:40 PM, Aaron Knister wrote:
On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
Hi,
Did you check the option to hold the netif_tx_lock_xxx() in ipoib_cm_rep_handler function (over the line set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
That does seem better, then the test_bit in the datapath could become non-atomic too :)
Jason
Thanks for the feedback! I've not tried that but I certainly can.
-Aaron
I just had a chance to do additional testing of the v3 patch. Giving it 30 new test iterations showed it's performance (based on the "organic" reproducer) to be on par with and marginally better than the v2 patch. With that said, I'll formally submit the v3 patch.
-Aaron
On 8/23/18 7:25 PM, Aaron Knister wrote:
I had a small window to test the v3 patch (applied to the 4.4.120 kernel) at scale and it does appear to fix the issue, so that's good. One area of concern, though, is the testing data suggests (and this isn't really scientific at all since there are many variables) the v3 patch appears to be 10% slower than the v2 patch. The "organic" test involves creating a large number of empty files in a GPFS filesystem and exacerbating the race condition with GPFS RPC traffic. My sample sizes aren't large, though (26 iterations for the v2 patch and 8 for the v3 patch due to time constraints). I don't really have a preference for either approach. What do folks think?
-Aaron
On 8/20/18 3:20 PM, Aaron Knister wrote:
It seems to fix the issue based on tests with my synthetic reproducer which consists of two carefully placed sleeps-- one in start_xmit and another before cm_rep_handler to force open the race window. In order to test it organically I need to commandeer many hundreds of nodes in our cluster which can be fairly disruptive to our user community. I'll send over v3 as an RFC and it would be great to get feedback about whether or not the patch is acceptable. If it is, I'll work on scheduling an at-scale test after which I'll submit v3. I think the odds are very low the synthetic reproducer would indicate this problem as fixed but the real world test would still experience the problem. I try to be thorough, though.
-Aaron
On 8/20/18 1:40 PM, Aaron Knister wrote:
On 8/20/18 12:28 PM, Jason Gunthorpe wrote:
On Mon, Aug 20, 2018 at 09:36:53AM +0300, Erez Shitrit wrote:
Hi,
Did you check the option to hold the netif_tx_lock_xxx() in ipoib_cm_rep_handler function (over the line set_bit(IPOIB_FLAG_OPER_UP)) instead of in the send data path flow?
That does seem better, then the test_bit in the datapath could become non-atomic too :)
Jason
Thanks for the feedback! I've not tried that but I certainly can.
-Aaron
Inside of start_xmit() the check that the connection is up and the queueing of packets for later transmission are not atomic. This leaves a window in which cm_rep_handler can run, set the connection up, dequeue pending packets, and leave the packets subsequently queued by start_xmit() sitting on neigh->queue until they are dropped when the connection is torn down. This only applies to connected mode. These dropped packets can really upset TCP, for example, and cause multi-minute delays in transmission for open connections.

Here's the code in start_xmit where we check to see if the connection is up:

	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	}

The race occurs if cm_rep_handler runs after the above connection check (specifically, once it reaches the point where it acquires priv->lock to dequeue pending skbs) but before the code below in start_xmit where packets are queued:

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		push_pseudo_header(skb, phdr->hwaddr);
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

The patch re-checks ipoib_cm_up with priv->lock held to avoid this race condition. Since the connection should be up most of the time (and thus *not* down most of the time), we don't hold the lock for the first check, which avoids a slowdown from unnecessary locking for the majority of the packets transmitted during the connection's life.

Signed-off-by: Aaron Knister <aaron.s.knister@nasa.gov>
Reviewed-by: Alex Estrin <alex.estrin@intel.com>
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 46 ++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26cde95b..a950c916 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1093,6 +1093,21 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 	spin_unlock_irqrestore(&priv->lock, flags);
 }

+static bool defer_neigh_skb(struct sk_buff *skb,
+			    struct net_device *dev,
+			    struct ipoib_neigh *neigh,
+			    struct ipoib_pseudo_header *phdr)
+{
+	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+		push_pseudo_header(skb, phdr->hwaddr);
+		__skb_queue_tail(&neigh->queue, skb);
+		return true;
+	}
+
+	return false;
+}
+
+
 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = ipoib_priv(dev);
@@ -1101,6 +1116,7 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ipoib_pseudo_header *phdr;
 	struct ipoib_header *header;
 	unsigned long flags;
+	bool deferred_pkt = true;

 	phdr = (struct ipoib_pseudo_header *) skb->data;
 	skb_pull(skb, sizeof(*phdr));
@@ -1160,6 +1176,23 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
 			goto unref;
 		}
+		/*
+		 * Re-check ipoib_cm_up with priv->lock held to avoid
+		 * race condition between start_xmit and skb_dequeue in
+		 * cm_rep_handler. Since odds are the conn should be up
+		 * most of the time, we don't hold the lock for the
+		 * first check above
+		 */
+		spin_lock_irqsave(&priv->lock, flags);
+		if (ipoib_cm_up(neigh)) {
+			spin_unlock_irqrestore(&priv->lock, flags);
+			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+		} else {
+			deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		}
+
+		goto unref;
 	} else if (neigh->ah && neigh->ah->valid) {
 		neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
 						IPOIB_QPN(phdr->hwaddr));
@@ -1168,17 +1201,16 @@ static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		neigh_refresh_path(neigh, phdr->hwaddr, dev);
 	}

-	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
-		push_pseudo_header(skb, phdr->hwaddr);
-		spin_lock_irqsave(&priv->lock, flags);
-		__skb_queue_tail(&neigh->queue, skb);
-		spin_unlock_irqrestore(&priv->lock, flags);
-	} else {
+	spin_lock_irqsave(&priv->lock, flags);
+	deferred_pkt = defer_neigh_skb(skb, dev, neigh, phdr);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+unref:
+	if (!deferred_pkt) {
 		++dev->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 	}

-unref:
 	ipoib_neigh_put(neigh);

 	return NETDEV_TX_OK;
2.12.3