commit 1f30fb9166d4f15a1aa19449b9da871fe0ed4796 upstream
[Backport for 5.4: Removed handling of OVS_ACTION_ATTR_DEC_TTL as it
doesn't exist in this version. BUILD_BUG_ON condition adjusted
accordingly.]
While parsing user-provided actions, openvswitch module may dynamically
allocate memory and store pointers in the internal copy of the actions.
So this memory has to be freed while destroying the actions.
Currently there are only two such actions: ct() and set(). However,
there are many actions that can hold nested lists of actions and
ovs_nla_free_flow_actions() just jumps over them leaking the memory.
For example, removal of the flow with the following actions will lead
to a leak of the memory allocated by nf_ct_tmpl_alloc():
actions:clone(ct(commit),0)
Non-freed set() action may also leak the 'dst' structure for the
tunnel info including device references.
Under certain conditions with a high rate of flow rotation that may
cause significant memory leak problem (2MB per second in reporter's
case). The problem is also hard to mitigate, because the user doesn't
have direct control over the datapath flows generated by OVS.
Fix that by iterating over all the nested actions and freeing
everything that needs to be freed recursively.
New build time assertion should protect us from this problem if new
actions will be added in the future.
Unfortunately, openvswitch module doesn't use NLA_F_NESTED, so all
attributes has to be explicitly checked. sample() and clone() actions
are mixing extra attributes into the user-provided action list. That
prevents some code generalization too.
Fixes: 34ae932a4036 ("openvswitch: Make tunnel set action attach a metadata dst")
Link: https://mail.openvswitch.org/pipermail/ovs-dev/2022-March/392922.html
Reported-by: Stéphane Graber <stgraber(a)ubuntu.com>
Signed-off-by: Ilya Maximets <i.maximets(a)ovn.org>
Acked-by: Aaron Conole <aconole(a)redhat.com>
Signed-off-by: David S. Miller <davem(a)davemloft.net>
---
The fix was already backported down to 5.10. Sending this slightly modified
version that works on 5.4.
net/openvswitch/flow_netlink.c | 80 +++++++++++++++++++++++++++++++---
1 file changed, 75 insertions(+), 5 deletions(-)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 8461de79f67b..67125939d7ee 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2266,6 +2266,51 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size)
return sfa;
}
+static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len);
+
+static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action)
+{
+ const struct nlattr *a;
+ int rem;
+
+ nla_for_each_nested(a, action, rem) {
+ switch (nla_type(a)) {
+ case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL:
+ case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER:
+ ovs_nla_free_nested_actions(nla_data(a), nla_len(a));
+ break;
+ }
+ }
+}
+
+static void ovs_nla_free_clone_action(const struct nlattr *action)
+{
+ const struct nlattr *a = nla_data(action);
+ int rem = nla_len(action);
+
+ switch (nla_type(a)) {
+ case OVS_CLONE_ATTR_EXEC:
+ /* The real list of actions follows this attribute. */
+ a = nla_next(a, &rem);
+ ovs_nla_free_nested_actions(a, rem);
+ break;
+ }
+}
+
+static void ovs_nla_free_sample_action(const struct nlattr *action)
+{
+ const struct nlattr *a = nla_data(action);
+ int rem = nla_len(action);
+
+ switch (nla_type(a)) {
+ case OVS_SAMPLE_ATTR_ARG:
+ /* The real list of actions follows this attribute. */
+ a = nla_next(a, &rem);
+ ovs_nla_free_nested_actions(a, rem);
+ break;
+ }
+}
+
static void ovs_nla_free_set_action(const struct nlattr *a)
{
const struct nlattr *ovs_key = nla_data(a);
@@ -2279,25 +2324,50 @@ static void ovs_nla_free_set_action(const struct nlattr *a)
}
}
-void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len)
{
const struct nlattr *a;
int rem;
- if (!sf_acts)
+ /* Whenever new actions are added, the need to update this
+ * function should be considered.
+ */
+ BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 21);
+
+ if (!actions)
return;
- nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) {
+ nla_for_each_attr(a, actions, len, rem) {
switch (nla_type(a)) {
- case OVS_ACTION_ATTR_SET:
- ovs_nla_free_set_action(a);
+ case OVS_ACTION_ATTR_CHECK_PKT_LEN:
+ ovs_nla_free_check_pkt_len_action(a);
break;
+
+ case OVS_ACTION_ATTR_CLONE:
+ ovs_nla_free_clone_action(a);
+ break;
+
case OVS_ACTION_ATTR_CT:
ovs_ct_free_action(a);
break;
+
+ case OVS_ACTION_ATTR_SAMPLE:
+ ovs_nla_free_sample_action(a);
+ break;
+
+ case OVS_ACTION_ATTR_SET:
+ ovs_nla_free_set_action(a);
+ break;
}
}
+}
+
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ if (!sf_acts)
+ return;
+ ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len);
kfree(sf_acts);
}
--
2.34.3
commit 2061ecfdf2350994e5b61c43e50e98a7a70e95ee upstream
[Backport to 5.10: minor rebase in ovs_ct_clear function.
This version also applicable to and tested on 5.4 and 4.19.]
If packet headers changed, the cached nfct is no longer relevant
for the packet and attempt to re-use it leads to the incorrect packet
classification.
This issue is causing broken connectivity in OpenStack deployments
with OVS/OVN due to hairpin traffic being unexpectedly dropped.
The setup has datapath flows with several conntrack actions and tuple
changes between them:
actions:ct(commit,zone=8,mark=0/0x1,nat(src)),
set(eth(src=00:00:00:00:00:01,dst=00:00:00:00:00:06)),
set(ipv4(src=172.18.2.10,dst=192.168.100.6,ttl=62)),
ct(zone=8),recirc(0x4)
After the first ct() action the packet headers are almost fully
re-written. The next ct() tries to re-use the existing nfct entry
and marks the packet as invalid, so it gets dropped later in the
pipeline.
Clearing the cached conntrack entry whenever packet tuple is changed
to avoid the issue.
The flow key should not be cleared though, because we should still
be able to match on the ct_state if the recirculation happens after
the tuple change but before the next ct() action.
Cc: stable(a)vger.kernel.org
Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action")
Reported-by: Frode Nordahl <frode.nordahl(a)canonical.com>
Link: https://mail.openvswitch.org/pipermail/ovs-discuss/2022-May/051829.html
Link: https://bugs.launchpad.net/ubuntu/+source/ovn/+bug/1967856
Signed-off-by: Ilya Maximets <i.maximets(a)ovn.org>
Link: https://lore.kernel.org/r/20220606221140.488984-1-i.maximets@ovn.org
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
The patch was already backported down to 5.15.
This version was adjusted to work on 5.10, 5.4 and 4.19.
net/openvswitch/actions.c | 6 ++++++
net/openvswitch/conntrack.c | 3 ++-
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 6d8d70021666..80fee9d118ee 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -372,6 +372,7 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
update_ip_l4_checksum(skb, nh, *addr, new_addr);
csum_replace4(&nh->check, *addr, new_addr);
skb_clear_hash(skb);
+ ovs_ct_clear(skb, NULL);
*addr = new_addr;
}
@@ -419,6 +420,7 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
update_ipv6_checksum(skb, l4_proto, addr, new_addr);
skb_clear_hash(skb);
+ ovs_ct_clear(skb, NULL);
memcpy(addr, new_addr, sizeof(__be32[4]));
}
@@ -659,6 +661,7 @@ static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key,
static void set_tp_port(struct sk_buff *skb, __be16 *port,
__be16 new_port, __sum16 *check)
{
+ ovs_ct_clear(skb, NULL);
inet_proto_csum_replace2(check, skb, *port, new_port, false);
*port = new_port;
}
@@ -698,6 +701,7 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
uh->dest = dst;
flow_key->tp.src = src;
flow_key->tp.dst = dst;
+ ovs_ct_clear(skb, NULL);
}
skb_clear_hash(skb);
@@ -760,6 +764,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
sh->checksum = old_csum ^ old_correct_csum ^ new_csum;
skb_clear_hash(skb);
+ ovs_ct_clear(skb, NULL);
+
flow_key->tp.src = sh->source;
flow_key->tp.dst = sh->dest;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 7ff98d39ec94..41f248895a87 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1324,7 +1324,8 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
if (skb_nfct(skb)) {
nf_conntrack_put(skb_nfct(skb));
nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
- ovs_ct_fill_key(skb, key);
+ if (key)
+ ovs_ct_fill_key(skb, key);
}
return 0;
--
2.34.3
[ Upstream commit 4a37f3dd9a83186cb88d44808ab35b78375082c9 ]
The original x86 sev_alloc() only called set_memory_decrypted() on
memory returned by alloc_pages_node(), so the page order calculation
fell out of that logic. However, the common dma-direct code has several
potential allocators, not all of which are guaranteed to round up the
underlying allocation to a power-of-two size, so carrying over that
calculation for the encryption/decryption size was a mistake. Fix it by
rounding to a *number* of pages, rather than an order.
Until recently there was an even worse interaction with DMA_DIRECT_REMAP
where we could have ended up decrypting part of the next adjacent
vmalloc area, only averted by no architecture actually supporting both
configs at once. Don't ask how I found that one out...
Fixes: c10f07aa27da ("dma/direct: Handle force decryption for DMA coherent buffers in common code")
Signed-off-by: Robin Murphy <robin.murphy(a)arm.com>
Signed-off-by: Christoph Hellwig <hch(a)lst.de>
Acked-by: David Rientjes <rientjes(a)google.com>
[ backport the functional change without all the prior refactoring ]
Signed-off-by: Robin Murphy <robin.murphy(a)arm.com>
---
Hi Greg, Sasha,
I see you managed to resolve this back as far as 5.15 already, so please
consider this backport to complete the set. This may need to end up in
the Android 5.10 kernel in future for unpleasant reasons, but as an
upstream fix I figure it may as well take the upstream stable route too.
Thanks,
Robin.
kernel/dma/direct.c | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 8ca84610d4d4..944fdadb5a64 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -191,7 +191,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
goto out_free_pages;
if (force_dma_unencrypted(dev)) {
err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
+ PFN_UP(size));
if (err)
goto out_free_pages;
}
@@ -213,7 +213,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
ret = page_address(page);
if (force_dma_unencrypted(dev)) {
err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
+ PFN_UP(size));
if (err)
goto out_free_pages;
}
@@ -234,7 +234,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
out_encrypt_pages:
if (force_dma_unencrypted(dev)) {
err = set_memory_encrypted((unsigned long)page_address(page),
- 1 << get_order(size));
+ PFN_UP(size));
/* If memory cannot be re-encrypted, it must be leaked */
if (err)
return NULL;
@@ -247,8 +247,6 @@ void *dma_direct_alloc(struct device *dev, size_t size,
void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
- unsigned int page_order = get_order(size);
-
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
@@ -269,7 +267,7 @@ void dma_direct_free(struct device *dev, size_t size,
return;
if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
+ set_memory_encrypted((unsigned long)cpu_addr, PFN_UP(size));
if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
vunmap(cpu_addr);
@@ -305,8 +303,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
ret = page_address(page);
if (force_dma_unencrypted(dev)) {
- if (set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size)))
+ if (set_memory_decrypted((unsigned long)ret, PFN_UP(size)))
goto out_free_pages;
}
memset(ret, 0, size);
@@ -322,7 +319,6 @@ void dma_direct_free_pages(struct device *dev, size_t size,
struct page *page, dma_addr_t dma_addr,
enum dma_data_direction dir)
{
- unsigned int page_order = get_order(size);
void *vaddr = page_address(page);
/* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
@@ -331,7 +327,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
return;
if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
+ set_memory_encrypted((unsigned long)vaddr, PFN_UP(size));
dma_free_contiguous(dev, page, size);
}
--
2.36.1.dirty
Commit 787af64d05cd ("mm: page_alloc: validate buddy before check its migratetype.")
fixes a bug in 1dd214b8f21c and there is a similar bug in d9dddbf55667 that
can be fixed in a similar way too.
In addition, for RISC-V arch the first 2MB RAM could be reserved for opensbi,
so it would have pfn_base=512 and mem_map began with 512th PFN when
CONFIG_FLATMEM=y.
But __find_buddy_pfn algorithm thinks the start pfn 0, it could get 0 pfn or
less than the pfn_base value. We need page_is_buddy() to verify the buddy to
prevent accessing an invalid buddy.
Fixes: d9dddbf55667 ("mm/page_alloc: prevent merging between isolated and other pageblocks")
Cc: stable(a)vger.kernel.org
Reported-by: zjb194813(a)alibaba-inc.com
Reported-by: tianhu.hh(a)alibaba-inc.com
Signed-off-by: Xianting Tian <xianting.tian(a)linux.alibaba.com>
---
mm/page_alloc.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6e682569e5b..1c423faa4b62 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -864,6 +864,9 @@ static inline void __free_one_page(struct page *page,
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
+
+ if (!page_is_buddy(page, buddy, order))
+ goto done_merging;
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
--
2.17.1
Commit 787af64d05cd ("mm: page_alloc: validate buddy before check its migratetype.")
fixes a bug in 1dd214b8f21c and there is a similar bug in d9dddbf55667 that
can be fixed in a similar way too.
In addition, for RISC-V arch the first 2MB RAM could be reserved for opensbi,
so it would have pfn_base=512 and mem_map began with 512th PFN when
CONFIG_FLATMEM=y.
But __find_buddy_pfn algorithm thinks the start pfn 0, it could get 0 pfn or
less than the pfn_base value. We need page_is_buddy() to verify the buddy to
prevent accessing an invalid buddy.
Fixes: d9dddbf55667 ("mm/page_alloc: prevent merging between isolated and other pageblocks")
Cc: stable(a)vger.kernel.org
Reported-by: zjb194813(a)alibaba-inc.com
Reported-by: tianhu.hh(a)alibaba-inc.com
Signed-off-by: Xianting Tian <xianting.tian(a)linux.alibaba.com>
---
mm/page_alloc.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6e682569e5b..1c423faa4b62 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -864,6 +864,9 @@ static inline void __free_one_page(struct page *page,
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
+
+ if (!page_is_buddy(page, buddy, order))
+ goto done_merging;
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
--
2.17.1