Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Also, as pointed out by Andi Kleen, we need to make sure that the entire ELF note is within the page bounds, so move the overflow check up and add an extra note_size boundary validation.
The Fixes tag below points to the commit that moved this code into lib/buildid.c; the code was subsequently used in the perf subsystem, which exposed it to perf_event_open() users in v5.12+.
Cc: stable@vger.kernel.org
Cc: Jann Horn jannh@google.com
Suggested-by: Andi Kleen ak@linux.intel.com
Fixes: bd7525dacd7e ("bpf: Move stack_map_get_build_id into lib")
Signed-off-by: Andrii Nakryiko andrii@kernel.org
---
lib/buildid.c | 74 ++++++++++++++++++++++++++++++---------------------
1 file changed, 43 insertions(+), 31 deletions(-)
diff --git a/lib/buildid.c b/lib/buildid.c index e02b5507418b..61bc5b767064 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id, const void *note_start, Elf32_Word note_size) { - Elf32_Word note_offs = 0, new_offs; - - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + const char note_name[] = "GNU"; + const size_t note_name_sz = sizeof(note_name); + u64 note_off = 0, new_off, name_sz, desc_sz; + const char *data; + + while (note_off + sizeof(Elf32_Nhdr) < note_size && + note_off + sizeof(Elf32_Nhdr) > note_off /* overflow */) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_off); + + name_sz = READ_ONCE(nhdr->n_namesz); + desc_sz = READ_ONCE(nhdr->n_descsz); + + new_off = note_off + sizeof(Elf32_Nhdr); + if (check_add_overflow(new_off, ALIGN(name_sz, 4), &new_off) || + check_add_overflow(new_off, ALIGN(desc_sz, 4), &new_off) || + new_off > note_size) + break;
if (nhdr->n_type == BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - !strcmp((char *)(nhdr + 1), "GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BUILD_ID_SIZE_MAX - nhdr->n_descsz); + name_sz == note_name_sz && + strcmp((char *)(nhdr + 1), note_name) == 0 && + desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) { + data = note_start + note_off + ALIGN(note_name_sz, 4); + memcpy(build_id, data, desc_sz); + memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size) - *size = nhdr->n_descsz; + *size = desc_sz; return 0; } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; + + note_off = new_off; }
return -EINVAL; @@ -71,7 +77,7 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, { Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; Elf32_Phdr *phdr; - int i; + __u32 i, phnum;
/* * FIXME @@ -80,9 +86,10 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, */ if (ehdr->e_phoff != sizeof(Elf32_Ehdr)) return -EINVAL; + + phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + if (phnum > (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) return -EINVAL;
phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); @@ -90,8 +97,8 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + page_addr + READ_ONCE(phdr[i].p_offset), + READ_ONCE(phdr[i].p_filesz))) return 0; } return -EINVAL; @@ -103,7 +110,7 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, { Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; Elf64_Phdr *phdr; - int i; + __u32 i, phnum;
/* * FIXME @@ -112,18 +119,19 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, */ if (ehdr->e_phoff != sizeof(Elf64_Ehdr)) return -EINVAL; + + phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + if (phnum > (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) return -EINVAL;
phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
- for (i = 0; i < ehdr->e_phnum; ++i) { + for (i = 0; i < phnum; ++i) { if (phdr[i].p_type == PT_NOTE && !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + page_addr + READ_ONCE(phdr[i].p_offset), + READ_ONCE(phdr[i].p_filesz))) return 0; } return -EINVAL; @@ -152,6 +160,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, page = find_get_page(vma->vm_file->f_mapping, 0); if (!page) return -EFAULT; /* page not mapped */ + if (!PageUptodate(page)) { + put_page(page); + return -EFAULT; + }
ret = -EINVAL; page_addr = kmap_local_page(page);
@@ -152,6 +160,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, page = find_get_page(vma->vm_file->f_mapping, 0); if (!page) return -EFAULT; /* page not mapped */
+ if (!PageUptodate(page)) {
+         put_page(page);
+         return -EFAULT;
+ }
That change is not described. As I understand it, it might prevent reading previous data in the page, or maybe junk after an I/O error? Anyway, I guess it's a good change.
Reviewed-by: Andi Kleen ak@linux.intel.com
-Andi
On Mon, Aug 12, 2024 at 5:52 PM Andi Kleen ak@linux.intel.com wrote:
@@ -152,6 +160,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, page = find_get_page(vma->vm_file->f_mapping, 0); if (!page) return -EFAULT; /* page not mapped */
if (!PageUptodate(page)) {
put_page(page);
return -EFAULT;
}
That change is not described. As I understand it might prevent reading previous data in the page or maybe junk under an IO error? Anyways I guess it's a good change.
From what I understand, one can get a valid page from find_get_page() (same for a folio), but it might not yet be completely filled in. PageUptodate() is supposed to detect this and prevent the use of incomplete page data.
Reviewed-by: Andi Kleen ak@linux.intel.com
-Andi
On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko andrii@kernel.org wrote:
Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Also, as pointed out by Andi Kleen, we need to make sure that entire ELF note is within a page bounds, so move the overflow check up and add an extra note_size boundaries validation.
Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+.
Sorry, I missed some things in previous review rounds:
[...]
@@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
[...]
  if (nhdr->n_type == BUILD_ID &&
-     nhdr->n_namesz == sizeof("GNU") &&
-     !strcmp((char *)(nhdr + 1), "GNU") &&
-     nhdr->n_descsz > 0 &&
-     nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
-         memcpy(build_id,
-                note_start + note_offs +
-                ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
-                nhdr->n_descsz);
-         memset(build_id + nhdr->n_descsz, 0,
-                BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+     name_sz == note_name_sz &&
+     strcmp((char *)(nhdr + 1), note_name) == 0 &&
Please change this to something like "memcmp((char *)(nhdr + 1), note_name, note_name_sz) == 0" to ensure that we can't run off the end of the page if there are no null bytes in the rest of the page.
[...]
@@ -90,8 +97,8 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, for (i = 0; i < ehdr->e_phnum; ++i) {
Please change this to "for (i = 0; i < phnum; ++i) {" like in the 64-bit version.
With these two changes applied:
Reviewed-by: Jann Horn jannh@google.com
On Tue, Aug 13, 2024 at 1:59 PM Jann Horn jannh@google.com wrote:
On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko andrii@kernel.org wrote:
Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Also, as pointed out by Andi Kleen, we need to make sure that entire ELF note is within a page bounds, so move the overflow check up and add an extra note_size boundaries validation.
Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+.
Sorry, I missed some things in previous review rounds:
[...]
@@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
[...]
  if (nhdr->n_type == BUILD_ID &&
-     nhdr->n_namesz == sizeof("GNU") &&
-     !strcmp((char *)(nhdr + 1), "GNU") &&
-     nhdr->n_descsz > 0 &&
-     nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
-         memcpy(build_id,
-                note_start + note_offs +
-                ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
-                nhdr->n_descsz);
-         memset(build_id + nhdr->n_descsz, 0,
-                BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+     name_sz == note_name_sz &&
+     strcmp((char *)(nhdr + 1), note_name) == 0 &&
Please change this to something like "memcmp((char *)(nhdr + 1), note_name, note_name_sz) == 0" to ensure that we can't run off the end of the page if there are no null bytes in the rest of the page.
I did switch this to strncmp() at some earlier point, but then realized that there is no point because note_name is controlled by us and will ensure there is a zero at byte (note_name_sz - 1). So I don't think memcmp() buys us anything.
[...]
@@ -90,8 +97,8 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, for (i = 0; i < ehdr->e_phnum; ++i) {
Please change this to "for (i = 0; i < phnum; ++i) {" like in the 64-bit version.
This did slip through, yep. I'll check with BPF maintainers if this can be fixed up while applying or whether I should send another revision.
With these two changes applied:
Reviewed-by: Jann Horn jannh@google.com
On Wed, Aug 14, 2024 at 1:21 AM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
On Tue, Aug 13, 2024 at 1:59 PM Jann Horn jannh@google.com wrote:
On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko andrii@kernel.org wrote:
Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Also, as pointed out by Andi Kleen, we need to make sure that entire ELF note is within a page bounds, so move the overflow check up and add an extra note_size boundaries validation.
Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+.
Sorry, I missed some things in previous review rounds:
[...]
@@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
[...]
  if (nhdr->n_type == BUILD_ID &&
-     nhdr->n_namesz == sizeof("GNU") &&
-     !strcmp((char *)(nhdr + 1), "GNU") &&
-     nhdr->n_descsz > 0 &&
-     nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
-         memcpy(build_id,
-                note_start + note_offs +
-                ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
-                nhdr->n_descsz);
-         memset(build_id + nhdr->n_descsz, 0,
-                BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+     name_sz == note_name_sz &&
+     strcmp((char *)(nhdr + 1), note_name) == 0 &&
Please change this to something like "memcmp((char *)(nhdr + 1), note_name, note_name_sz) == 0" to ensure that we can't run off the end of the page if there are no null bytes in the rest of the page.
I did switch this to strncmp() at some earlier point, but then realized that there is no point because note_name is controlled by us and will ensure there is a zero at byte (note_name_sz - 1). So I don't think memcmp() buys us anything.
There are two reasons why using strcmp() here makes me uneasy.
First: We're still operating on shared memory that can concurrently change.
Let's say strcmp is implemented like this (this is the generic C implementation in the kernel, which I think is the implementation that's used for x86-64):
/*
 * Byte-wise comparison of two NUL-terminated strings.
 *
 * Returns 0 when every byte up to and including the terminator matches,
 * -1 when the first differing byte of cs is smaller than that of ct, and
 * 1 otherwise. Bytes are compared as unsigned char.
 */
int strcmp(const char *cs, const char *ct)
{
	unsigned char c1, c2;

	while (1) {
		/* plain (non-volatile) loads of one byte from each string */
		c1 = *cs++;
		c2 = *ct++;
		if (c1 != c2)
			return c1 < c2 ? -1 : 1;
		/* bytes are equal here: stop only when that shared byte is NUL */
		if (!c1)
			break;
	}
	return 0;
}
No READ_ONCE() or anything like that - it's not designed for being used on concurrently changing memory.
And let's say you call it like strcmp(<shared memory>, "GNU"), and we're now in the fourth iteration. If the compiler decides to re-fetch the value of "c1" from memory for each of the two conditions, then it could be that the "if (c1 != c2)" sees c1='\0' and c2='\0', so the condition evaluates as false; but then at the "if (!c1)", the value in memory changed, and we see c1='A'. So now in the next round, we'll be accessing out-of-bounds memory behind the 4-byte string constant "GNU".
So I don't think strcmp() on memory that can concurrently change is allowed.
(It actually seems like the generic memcmp() is also implemented without READ_ONCE(), maybe we should change that...)
Second: You are assuming that if one side of the strcmp() is at most four bytes long (including null terminator), then strcmp() also won't access more than 4 bytes of the other string, even if that string does not have a null terminator at index 4. I don't think that's part of the normal strcmp() API contract.
On Wed, Aug 14, 2024 at 9:14 AM Jann Horn jannh@google.com wrote:
On Wed, Aug 14, 2024 at 1:21 AM Andrii Nakryiko andrii.nakryiko@gmail.com wrote:
On Tue, Aug 13, 2024 at 1:59 PM Jann Horn jannh@google.com wrote:
On Tue, Aug 13, 2024 at 2:29 AM Andrii Nakryiko andrii@kernel.org wrote:
Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once.
Also, as pointed out by Andi Kleen, we need to make sure that entire ELF note is within a page bounds, so move the overflow check up and add an extra note_size boundaries validation.
Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+.
Sorry, I missed some things in previous review rounds:
[...]
@@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id,
[...]
  if (nhdr->n_type == BUILD_ID &&
-     nhdr->n_namesz == sizeof("GNU") &&
-     !strcmp((char *)(nhdr + 1), "GNU") &&
-     nhdr->n_descsz > 0 &&
-     nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
-         memcpy(build_id,
-                note_start + note_offs +
-                ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
-                nhdr->n_descsz);
-         memset(build_id + nhdr->n_descsz, 0,
-                BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+     name_sz == note_name_sz &&
+     strcmp((char *)(nhdr + 1), note_name) == 0 &&
Please change this to something like "memcmp((char *)(nhdr + 1), note_name, note_name_sz) == 0" to ensure that we can't run off the end of the page if there are no null bytes in the rest of the page.
I did switch this to strncmp() at some earlier point, but then realized that there is no point because note_name is controlled by us and will ensure there is a zero at byte (note_name_sz - 1). So I don't think memcmp() buys us anything.
There are two reasons why using strcmp() here makes me uneasy.
First: We're still operating on shared memory that can concurrently change.
Let's say strcmp is implemented like this, this is the generic C implementation in the kernel (which I think is the implementation that's used for x86-64):
/*
 * Byte-wise comparison of two NUL-terminated strings.
 *
 * Returns 0 when every byte up to and including the terminator matches,
 * -1 when the first differing byte of cs is smaller than that of ct, and
 * 1 otherwise. Bytes are compared as unsigned char.
 */
int strcmp(const char *cs, const char *ct)
{
	unsigned char c1, c2;

	while (1) {
		/* plain (non-volatile) loads of one byte from each string */
		c1 = *cs++;
		c2 = *ct++;
		if (c1 != c2)
			return c1 < c2 ? -1 : 1;
		/* bytes are equal here: stop only when that shared byte is NUL */
		if (!c1)
			break;
	}
	return 0;
}
No READ_ONCE() or anything like that - it's not designed for being used on concurrently changing memory.
And let's say you call it like strcmp(<shared memory>, "GNU"), and we're now in the fourth iteration. If the compiler decides to re-fetch the value of "c1" from memory for each of the two conditions, then it could be that the "if (c1 != c2)" sees c1='\0' and c2='\0', so the condition evaluates as false; but then at the "if (!c1)", the value in memory changed, and we see c1='A'. So now in the next round, we'll be accessing out-of-bounds memory behind the 4-byte string constant "GNU".
So I don't think strcmp() on memory that can concurrently change is allowed.
(It actually seems like the generic memcmp() is also implemented without READ_ONCE(), maybe we should change that...)
Second: You are assuming that if one side of the strcmp() is at most four bytes long (including null terminator), then strcmp() also won't access more than 4 bytes of the other string, even if that string does not have a null terminator at index 4. I don't think that's part of the normal strcmp() API contract.
Ok, I'm convinced, all fair points. I'll switch to memcmp(), there is no downside to that anyways.
linux-stable-mirror@lists.linaro.org