We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com --- mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d3e66136e41a3..49b98082c5401 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1516,10 +1516,9 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) }
static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, bool write) + pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; - pgprot_t prot = vma->vm_page_prot; pud_t entry;
if (!pud_none(*pud)) { @@ -1581,7 +1580,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
ptl = pud_lock(vma->vm_mm, vmf->pud); - insert_pfn_pud(vma, addr, vmf->pud, pfn, write); + insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); spin_unlock(ptl);
return VM_FAULT_NOPAGE; @@ -1625,7 +1624,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR); } insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)), - write); + vma->vm_page_prot, write); spin_unlock(ptl);
return VM_FAULT_NOPAGE;
On Wed, Jun 11, 2025 at 02:06:52PM +0200, David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com
mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d3e66136e41a3..49b98082c5401 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1516,10 +1516,9 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) } static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+ pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
{ struct mm_struct *mm = vma->vm_mm;
- pgprot_t prot = vma->vm_page_prot; pud_t entry;
if (!pud_none(*pud)) { @@ -1581,7 +1580,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); spin_unlock(ptl);
return VM_FAULT_NOPAGE; @@ -1625,7 +1624,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR); } insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
- write);
+ vma->vm_page_prot, write);
Actually it's not immediately obvious to me why we don't call track_pfn_insert() and forward the pgprot here as well. Prior to me adding vmf_insert_folio_pud() device DAX would call vmf_insert_pfn_pud(), and the intent at least seems to have been to change pgprot for that (and we did for the PTE/PMD versions).
However now that the ZONE_DEVICE folios are refcounted normally I switched device dax to using vmf_insert_folio_*() which never changes pgprot based on x86 PAT. So I think we probably need to either add that to vmf_insert_folio_*() or a new variant or make it the responsibility of callers to figure out the correct pgprot.
spin_unlock(ptl); return VM_FAULT_NOPAGE; -- 2.49.0
On 12.06.25 03:56, Alistair Popple wrote:
On Wed, Jun 11, 2025 at 02:06:52PM +0200, David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com
mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d3e66136e41a3..49b98082c5401 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1516,10 +1516,9 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) } static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+ pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
{ struct mm_struct *mm = vma->vm_mm;
- pgprot_t prot = vma->vm_page_prot; pud_t entry;
if (!pud_none(*pud)) { @@ -1581,7 +1580,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot); ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); spin_unlock(ptl);
return VM_FAULT_NOPAGE; @@ -1625,7 +1624,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR); } insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
- write);
+ vma->vm_page_prot, write);
Actually it's not immediately obvious to me why we don't call track_pfn_insert() and forward the pgprot here as well.
(track_pfn_insert is now called pfnmap_setup_cachemode_pfn)
Prior to me adding vmf_insert_folio_pud()
device DAX would call vmf_insert_pfn_pud(), and the intent at least seems to have been to change pgprot for that (and we did for the PTE/PMD versions).
It's only for PFNMAP mappings as far as I understand. I think this is mostly about drivers mapping actual weird stuff with weird memory types (e.g., vfio mapping mmio etc) into the page tables, that does not have a struct page.
However now that the ZONE_DEVICE folios are refcounted normally I switched device dax to using vmf_insert_folio_*() which never changes pgprot based on x86 PAT. So I think we probably need to either add that to vmf_insert_folio_*() or a new variant or make it the responsibility of callers to figure out the correct pgprot.
I would assume that for ZONE_DEVICE the cachemode is always simpler (e.g., no MMIO?)?
In any case, I would assume ZONE_DEVICE only ended up "accidentally" triggering it and that it didn't make a difference.
Observe that pfnmap_setup_cachemode_pfn() is only called from vmf_insert_pfn_*() ... well, and our ugly friend __vm_insert_mixed() that similarly inserts a PFN mapping.
David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
This is only a problem if the kernel mapped the pud in advance of userspace mapping it, right?
The change looks good.
Reviewed-by: Dan Williams dan.j.williams@intel.com
...but I am struggling with the scenario where this causes problems in practice, where vm_page_prot is the wrong cachemode.
On 12.06.25 06:34, Dan Williams wrote:
David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
This is only a problem if the kernel mapped the pud in advance of userspace mapping it, right?
Good question, PAT code is confusing.
What I understood is that drivers like vfio will register the range with the expected cachemode, and then rely on vm_insert_* to fill out the cachemode for them.
Peter explained in the discussion here [1] how, e.g., vfio triggers that early registration.
Regarding vfio, I can see that we do in vfio_pci_core_mmap() unconditionally:
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
and probably rely on us querying the actual cachemode to be used later.
vfio can map all kinds of different memory types ...
[1] https://lkml.kernel.org/r/aBDXr-Qp4z0tS50P@x1.local
The change looks good.
Reviewed-by: Dan Williams dan.j.williams@intel.com
...but I am struggling with the scenario where this causes problems in practice, where vm_page_prot is the wrong cachemode.
Yeah, it's all confusing.
But as long as we don't conclude that pfnmap_setup_cachemode_pfn() can be removed entirely (esp. also from the pte / pmd case), this seems to be the right thing to do, as dropping the updated pgprot was an accidental change in the introducing commit.
Is it actually stable material? I don't know, but possibly getting cachemodes wrong sounds ... bad?
On Wed, Jun 11, 2025 at 02:06:52PM +0200, David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com
Nice catch!
Reviewed-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com
mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d3e66136e41a3..49b98082c5401 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1516,10 +1516,9 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) }
static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+ pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
{ struct mm_struct *mm = vma->vm_mm;
- pgprot_t prot = vma->vm_page_prot; pud_t entry;
if (!pud_none(*pud)) {
@@ -1581,7 +1580,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) pfnmap_setup_cachemode_pfn(pfn_t_to_pfn(pfn), &pgprot);
ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); spin_unlock(ptl);
return VM_FAULT_NOPAGE;
@@ -1625,7 +1624,7 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR); } insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
- write);
+ vma->vm_page_prot, write);
spin_unlock(ptl);
return VM_FAULT_NOPAGE;
-- 2.49.0
On 12.06.25 17:28, Lorenzo Stoakes wrote:
On Wed, Jun 11, 2025 at 02:06:52PM +0200, David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com
Nice catch!
Reviewed-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com
Thanks! What's your opinion on stable? Really hard to judge the impact ...
On Thu, Jun 12, 2025 at 05:36:35PM +0200, David Hildenbrand wrote:
On 12.06.25 17:28, Lorenzo Stoakes wrote:
On Wed, Jun 11, 2025 at 02:06:52PM +0200, David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries")
Ha! I don't even remember doing that patch... hm did I introduce this -ignoring cache- thing? Sorry! :P
Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com
Nice catch!
Reviewed-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com
Thanks! What's your opinion on stable? Really hard to judge the impact ...
I think it makes sense? This is currently incorrect so let's do the right thing and backport.
I think as per Dan it's probably difficult to picture this causing a problem, but on principle I think this is correct, and I don't see any harm in backporting?
-- Cheers,
David / dhildenb
On 12.06.25 17:59, Lorenzo Stoakes wrote:
On Thu, Jun 12, 2025 at 05:36:35PM +0200, David Hildenbrand wrote:
On 12.06.25 17:28, Lorenzo Stoakes wrote:
On Wed, Jun 11, 2025 at 02:06:52PM +0200, David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries")
Ha! I don't even remember doing that patch... hm did I introduce this -ignoring cache- thing? Sorry! :P
:)
Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com
Nice catch!
Reviewed-by: Lorenzo Stoakes lorenzo.stoakes@oracle.com
Thanks! What's your opinion on stable? Really hard to judge the impact ...
I think it makes sense? This is currently incorrect so let's do the right thing and backport.
I think as per Dan it's probably difficult to picture this causing a problem, but on principle I think this is correct, and I don't see any harm in backporting?
Same opinion, thanks!
On Wed, Jun 11, 2025 at 02:06:52PM +0200, David Hildenbrand wrote:
We setup the cache mode but ... don't forward the updated pgprot to insert_pfn_pud().
Only a problem on x86-64 PAT when mapping PFNs using PUDs that require a special cachemode.
Fix it by using the proper pgprot where the cachemode was setup.
Identified by code inspection.
Fixes: 7b806d229ef1 ("mm: remove vmf_insert_pfn_xxx_prot() for huge page-table entries") Cc: stable@vger.kernel.org Signed-off-by: David Hildenbrand david@redhat.com
mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-)
Reviewed-by: Jason Gunthorpe jgg@nvidia.com
Jason
linux-stable-mirror@lists.linaro.org