The patch titled
Subject: mm, devm_memremap_pages: fix shutdown handling
has been removed from the -mm tree. Its filename was
mm-devm_memremap_pages-fix-shutdown-handling.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Dan Williams <dan.j.williams(a)intel.com>
Subject: mm, devm_memremap_pages: fix shutdown handling
The last step before devm_memremap_pages() returns success is to allocate
a release action, devm_memremap_pages_release(), to tear the entire setup
down. However, the result from devm_add_action() is not checked.
Checking the error from devm_add_action() is not enough. The API
currently relies on the fact that the percpu_ref it is using is killed by
the time devm_memremap_pages_release() is run. Rather than continue this
awkward situation, offload the responsibility of killing the percpu_ref to
devm_memremap_pages_release() directly. This allows devm_memremap_pages()
to do the right thing for both init failures and shutdown.
Without this change we could fail to register the teardown of
devm_memremap_pages(). The likelihood of hitting this failure is tiny, as
small memory allocations almost always succeed. However, the impact of
the failure is large: any future reconfiguration, or disable/enable, of an
nvdimm namespace will fail forever, as subsequent calls to
devm_memremap_pages() will fail to set up the pgmap_radix since stale
entries remain for the physical address range.
An argument could be made to require that the ->kill() operation be set in
the @pgmap arg rather than passed in separately. However, being able to
grep for the kill routine directly at the devm_memremap_pages() call site
helps code readability and makes it easier to track the lifetime of a
given instance.
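For illustration, the calling convention after this change looks roughly
like the sketch below (the foo_* names are made up, loosely following the
drivers/dax/pmem.c hunk further down; both @ref and @kill must be set
before calling devm_memremap_pages(), and on failure the kill callback has
already run):

	static void foo_percpu_kill(struct percpu_ref *ref)
	{
		/* transition @ref to the dead state; the release path drains it later */
		percpu_ref_kill(ref);
	}

	...
	foo->pgmap.ref = &foo->ref;
	foo->pgmap.kill = foo_percpu_kill;	/* now mandatory, else -EINVAL */
	addr = devm_memremap_pages(dev, &foo->pgmap);
	if (IS_ERR(addr))
		return PTR_ERR(addr);		/* ->kill() has already run on failure */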
Link: http://lkml.kernel.org/r/154275558526.76910.7535251937849268605.stgit@dwill…
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Fixes: e8d513483300 ("memremap: change devm_memremap_pages interface...")
Reviewed-by: "Jérôme Glisse" <jglisse(a)redhat.com>
Reported-by: Logan Gunthorpe <logang(a)deltatee.com>
Reviewed-by: Logan Gunthorpe <logang(a)deltatee.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Cc: Balbir Singh <bsingharora(a)gmail.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/drivers/dax/pmem.c~mm-devm_memremap_pages-fix-shutdown-handling
+++ a/drivers/dax/pmem.c
@@ -48,9 +48,8 @@ static void dax_pmem_percpu_exit(void *d
percpu_ref_exit(ref);
}
-static void dax_pmem_percpu_kill(void *data)
+static void dax_pmem_percpu_kill(struct percpu_ref *ref)
{
- struct percpu_ref *ref = data;
struct dax_pmem *dax_pmem = to_dax_pmem(ref);
dev_dbg(dax_pmem->dev, "trace\n");
@@ -112,17 +111,10 @@ static int dax_pmem_probe(struct device
}
dax_pmem->pgmap.ref = &dax_pmem->ref;
+ dax_pmem->pgmap.kill = dax_pmem_percpu_kill;
addr = devm_memremap_pages(dev, &dax_pmem->pgmap);
- if (IS_ERR(addr)) {
- devm_remove_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);
- percpu_ref_exit(&dax_pmem->ref);
+ if (IS_ERR(addr))
return PTR_ERR(addr);
- }
-
- rc = devm_add_action_or_reset(dev, dax_pmem_percpu_kill,
- &dax_pmem->ref);
- if (rc)
- return rc;
/* adjust the dax_region resource to the start of data */
memcpy(&res, &dax_pmem->pgmap.res, sizeof(res));
--- a/drivers/nvdimm/pmem.c~mm-devm_memremap_pages-fix-shutdown-handling
+++ a/drivers/nvdimm/pmem.c
@@ -309,8 +309,11 @@ static void pmem_release_queue(void *q)
blk_cleanup_queue(q);
}
-static void pmem_freeze_queue(void *q)
+static void pmem_freeze_queue(struct percpu_ref *ref)
{
+ struct request_queue *q;
+
+ q = container_of(ref, typeof(*q), q_usage_counter);
blk_freeze_queue_start(q);
}
@@ -402,6 +405,7 @@ static int pmem_attach_disk(struct devic
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = &q->q_usage_counter;
+ pmem->pgmap.kill = pmem_freeze_queue;
if (is_nd_pfn(dev)) {
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
@@ -427,13 +431,6 @@ static int pmem_attach_disk(struct devic
memcpy(&bb_res, &nsio->res, sizeof(bb_res));
}
- /*
- * At release time the queue must be frozen before
- * devm_memremap_pages is unwound
- */
- if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
- return -ENOMEM;
-
if (IS_ERR(addr))
return PTR_ERR(addr);
pmem->virt_addr = addr;
--- a/include/linux/memremap.h~mm-devm_memremap_pages-fix-shutdown-handling
+++ a/include/linux/memremap.h
@@ -111,6 +111,7 @@ typedef void (*dev_page_free_t)(struct p
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @res: physical address range covered by @ref
* @ref: reference count that pins the devm_memremap_pages() mapping
+ * @kill: callback to transition @ref to the dead state
* @dev: host device of the mapping for debug
* @data: private data pointer for page_free()
* @type: memory type: see MEMORY_* in memory_hotplug.h
@@ -122,6 +123,7 @@ struct dev_pagemap {
bool altmap_valid;
struct resource res;
struct percpu_ref *ref;
+ void (*kill)(struct percpu_ref *ref);
struct device *dev;
void *data;
enum memory_type type;
--- a/kernel/memremap.c~mm-devm_memremap_pages-fix-shutdown-handling
+++ a/kernel/memremap.c
@@ -88,14 +88,10 @@ static void devm_memremap_pages_release(
resource_size_t align_start, align_size;
unsigned long pfn;
+ pgmap->kill(pgmap->ref);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
- if (percpu_ref_tryget_live(pgmap->ref)) {
- dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
- percpu_ref_put(pgmap->ref);
- }
-
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -116,7 +112,7 @@ static void devm_memremap_pages_release(
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @dev: hosting device for @res
- * @pgmap: pointer to a struct dev_pgmap
+ * @pgmap: pointer to a struct dev_pagemap
*
* Notes:
* 1/ At a minimum the res, ref and type members of @pgmap must be initialized
@@ -125,11 +121,8 @@ static void devm_memremap_pages_release(
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
* must be set to true
*
- * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages()
- * time (or devm release event). The expected order of events is that ref has
- * been through percpu_ref_kill() before devm_memremap_pages_release(). The
- * wait for the completion of all references being dropped and
- * percpu_ref_exit() must occur after devm_memremap_pages_release().
+ * 3/ pgmap->ref must be 'live' on entry and will be killed at
+ * devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -145,6 +138,9 @@ void *devm_memremap_pages(struct device
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
+ if (!pgmap->ref || !pgmap->kill)
+ return ERR_PTR(-EINVAL);
+
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- align_start;
@@ -170,12 +166,10 @@ void *devm_memremap_pages(struct device
if (is_ram != REGION_DISJOINT) {
WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
is_ram == REGION_MIXED ? "mixed" : "ram", res);
- return ERR_PTR(-ENXIO);
+ error = -ENXIO;
+ goto err_array;
}
- if (!pgmap->ref)
- return ERR_PTR(-EINVAL);
-
pgmap->dev = dev;
error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
@@ -217,7 +211,10 @@ void *devm_memremap_pages(struct device
align_size >> PAGE_SHIFT, pgmap);
percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
- devm_add_action(dev, devm_memremap_pages_release, pgmap);
+ error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+ pgmap);
+ if (error)
+ return ERR_PTR(error);
return __va(res->start);
@@ -228,6 +225,7 @@ void *devm_memremap_pages(struct device
err_pfn_remap:
pgmap_array_delete(res);
err_array:
+ pgmap->kill(pgmap->ref);
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(devm_memremap_pages);
--- a/tools/testing/nvdimm/test/iomap.c~mm-devm_memremap_pages-fix-shutdown-handling
+++ a/tools/testing/nvdimm/test/iomap.c
@@ -104,13 +104,26 @@ void *__wrap_devm_memremap(struct device
}
EXPORT_SYMBOL(__wrap_devm_memremap);
+static void nfit_test_kill(void *_pgmap)
+{
+ struct dev_pagemap *pgmap = _pgmap;
+
+ pgmap->kill(pgmap->ref);
+}
+
void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
resource_size_t offset = pgmap->res.start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
- if (nfit_res)
+ if (nfit_res) {
+ int rc;
+
+ rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
+ if (rc)
+ return ERR_PTR(rc);
return nfit_res->buf + offset - nfit_res->res.start;
+ }
return devm_memremap_pages(dev, pgmap);
}
EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
_
Patches currently in -mm which might be from dan.j.williams(a)intel.com are
The patch titled
Subject: mm, devm_memremap_pages: kill mapping "System RAM" support
has been removed from the -mm tree. Its filename was
mm-devm_memremap_pages-kill-mapping-system-ram-support.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Dan Williams <dan.j.williams(a)intel.com>
Subject: mm, devm_memremap_pages: kill mapping "System RAM" support
Given that devm_memremap_pages() requires a percpu_ref that is torn down
by devm_memremap_pages_release(), the current support for mapping RAM is
broken.
Support for remapping "System RAM" has been broken since the beginning and
there is no existing user of this code path, so just kill the support and
make it an explicit error.
This cleanup also simplifies a follow-on patch to fix the error path when
setting a devm release action for devm_memremap_pages_release() fails.
Link: http://lkml.kernel.org/r/154275557997.76910.14689813630968180480.stgit@dwil…
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: "Jérôme Glisse" <jglisse(a)redhat.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Logan Gunthorpe <logang(a)deltatee.com>
Cc: Balbir Singh <bsingharora(a)gmail.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/kernel/memremap.c~mm-devm_memremap_pages-kill-mapping-system-ram-support
+++ a/kernel/memremap.c
@@ -167,15 +167,12 @@ void *devm_memremap_pages(struct device
is_ram = region_intersects(align_start, align_size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
- if (is_ram == REGION_MIXED) {
- WARN_ONCE(1, "%s attempted on mixed region %pr\n",
- __func__, res);
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
+ is_ram == REGION_MIXED ? "mixed" : "ram", res);
return ERR_PTR(-ENXIO);
}
- if (is_ram == REGION_INTERSECTS)
- return __va(res->start);
-
if (!pgmap->ref)
return ERR_PTR(-EINVAL);
_
Patches currently in -mm which might be from dan.j.williams(a)intel.com are
The patch titled
Subject: mm, devm_memremap_pages: mark devm_memremap_pages() EXPORT_SYMBOL_GPL
has been removed from the -mm tree. Its filename was
mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Dan Williams <dan.j.williams(a)intel.com>
Subject: mm, devm_memremap_pages: mark devm_memremap_pages() EXPORT_SYMBOL_GPL
devm_memremap_pages() is a facility that can create struct page entries
for any arbitrary range and give drivers the ability to subvert core
aspects of page management.
Specifically the facility is tightly integrated with the kernel's memory
hotplug functionality. It injects an altmap argument deep into the
architecture specific vmemmap implementation to allow allocating from
specific reserved pages, and it has Linux specific assumptions about page
structure reference counting relative to get_user_pages() and
get_user_pages_fast(). It was an oversight and a mistake that this was
not marked EXPORT_SYMBOL_GPL from the outset.
Again, devm_memremap_pages() exposes and relies upon core kernel internal
assumptions and will continue to evolve along with 'struct page', memory
hotplug, and support for new memory types / topologies. Only an in-kernel
GPL-only driver is expected to keep up with this ongoing evolution. This
interface, and functionality derived from this interface, is not suitable
for kernel-external drivers.
Link: http://lkml.kernel.org/r/154275557457.76910.16923571232582744134.stgit@dwil…
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: "Jérôme Glisse" <jglisse(a)redhat.com>
Cc: Balbir Singh <bsingharora(a)gmail.com>
Cc: Logan Gunthorpe <logang(a)deltatee.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/kernel/memremap.c~mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl
+++ a/kernel/memremap.c
@@ -233,7 +233,7 @@ void *devm_memremap_pages(struct device
err_array:
return ERR_PTR(error);
}
-EXPORT_SYMBOL(devm_memremap_pages);
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
--- a/tools/testing/nvdimm/test/iomap.c~mm-devm_memremap_pages-mark-devm_memremap_pages-export_symbol_gpl
+++ a/tools/testing/nvdimm/test/iomap.c
@@ -113,7 +113,7 @@ void *__wrap_devm_memremap_pages(struct
return nfit_res->buf + offset - nfit_res->res.start;
return devm_memremap_pages(dev, pgmap);
}
-EXPORT_SYMBOL(__wrap_devm_memremap_pages);
+EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
pfn_t __wrap_phys_to_pfn_t(phys_addr_t addr, unsigned long flags)
{
_
Patches currently in -mm which might be from dan.j.williams(a)intel.com are
From: Dan Carpenter <dan.carpenter(a)oracle.com>
[ Upstream commit 1376b0a2160319125c3a2822e8c09bd283cd8141 ]
There is a '>' vs '<' typo so this loop is a no-op.
Fixes: d35dcc89fc93 ("staging: comedi: quatech_daqp_cs: fix daqp_ao_insn_write()")
Signed-off-by: Dan Carpenter <dan.carpenter(a)oracle.com>
Reviewed-by: Ian Abbott <abbotti(a)mev.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/staging/comedi/drivers/quatech_daqp_cs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/staging/comedi/drivers/quatech_daqp_cs.c b/drivers/staging/comedi/drivers/quatech_daqp_cs.c
index b3bbec0a0d23..f89a863ea04c 100644
--- a/drivers/staging/comedi/drivers/quatech_daqp_cs.c
+++ b/drivers/staging/comedi/drivers/quatech_daqp_cs.c
@@ -649,7 +649,7 @@ static int daqp_ao_insn_write(struct comedi_device *dev,
/* Make sure D/A update mode is direct update */
outb(0, dev->iobase + DAQP_AUX);
- for (i = 0; i > insn->n; i++) {
+ for (i = 0; i < insn->n; i++) {
val = data[0];
val &= 0x0fff;
val ^= 0x0800; /* Flip the sign */
--
2.18.0
While looking at BUGs associated with invalid huge page map counts, it
was observed that a huge pte pointer could become 'invalid' and point to
another task's page table. Consider the following:
A task takes a page fault on a shared hugetlbfs file and calls
huge_pte_alloc to get a ptep. Suppose the returned ptep points to a
shared pmd.
Now, another task truncates the hugetlbfs file. As part of truncation,
it unmaps everyone who has the file mapped. If the range being
truncated is covered by a shared pmd, huge_pmd_unshare will be called.
For all but the last user of the shared pmd, huge_pmd_unshare will
clear the pud pointing to the pmd. If the task in the middle of the
page fault is not the last user, the ptep returned by huge_pte_alloc
now points to another task's page table or worse. This leads to bad
things such as incorrect page map/reference counts or invalid memory
references.
To fix, expand the use of i_mmap_rwsem as follows (a simplified sketch of
the resulting fault-path pattern appears after this list):
- i_mmap_rwsem is held in read mode whenever huge_pmd_share is called.
huge_pmd_share is only called via huge_pte_alloc, so callers of
huge_pte_alloc take i_mmap_rwsem before calling. In addition, callers
of huge_pte_alloc continue to hold the semaphore until finished with
the ptep.
- i_mmap_rwsem is held in write mode whenever huge_pmd_unshare is called.
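The simplified sketch referenced above, condensed from the hugetlb_fault()
hunk below (fault-mutex handling and most error paths trimmed):

	mapping = vma->vm_file->f_mapping;
	i_mmap_lock_read(mapping);	/* keeps a shared pmd from being unshared */
	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
	if (!ptep) {
		i_mmap_unlock_read(mapping);
		return VM_FAULT_OOM;
	}
	/* ... take hugetlb_fault_mutex and handle the fault using ptep ... */
	i_mmap_unlock_read(mapping);	/* only after we are done with ptep */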
Cc: <stable(a)vger.kernel.org>
Fixes: 39dde65c9940 ("shared page table for hugetlb page")
Signed-off-by: Mike Kravetz <mike.kravetz(a)oracle.com>
---
mm/hugetlb.c | 67 ++++++++++++++++++++++++++++++++++-----------
mm/memory-failure.c | 14 +++++++++-
mm/migrate.c | 13 ++++++++-
mm/rmap.c | 4 +++
mm/userfaultfd.c | 11 ++++++--
5 files changed, 89 insertions(+), 20 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 309fb8c969af..2a3162030167 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3239,6 +3239,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ struct address_space *mapping = vma->vm_file->f_mapping;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
int ret = 0;
@@ -3247,14 +3248,25 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
mmun_start = vma->vm_start;
mmun_end = vma->vm_end;
- if (cow)
+ if (cow) {
mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+ } else {
+ /*
+ * For shared mappings i_mmap_rwsem must be held to call
+ * huge_pte_alloc, otherwise the returned ptep could go
+ * away if part of a shared pmd and another thread calls
+ * huge_pmd_unshare.
+ */
+ i_mmap_lock_read(mapping);
+ }
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
+
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
+
dst_pte = huge_pte_alloc(dst, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
@@ -3325,6 +3337,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (cow)
mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+ else
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -3772,14 +3786,18 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
};
/*
- * hugetlb_fault_mutex must be dropped before
- * handling userfault. Reacquire after handling
- * fault to make calling code simpler.
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+
+ i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -3927,6 +3945,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -3934,20 +3957,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
+ /*
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
+ *
+ * ptep could have already been assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something changed.
+ */
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
+ i_mmap_lock_read(mapping);
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep) {
+ i_mmap_unlock_read(mapping);
+ return VM_FAULT_OOM;
+ }
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
+ idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4035,6 +4069,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4639,10 +4674,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode.
+ * For hugetlbfs, this prevents removal of any page table entries associated
+ * with the address space. This is important as we are setting up sharing
+ * based on existing page table entries (mappings).
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4659,7 +4696,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_lock_write(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4689,7 +4725,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_unlock_write(mapping);
return pte;
}
@@ -4700,7 +4735,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * called with page table lock held.
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0cd3de3550f0..b992d1295578 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_success = try_to_unmap(hpage, ttu);
+ if (!PageHuge(hpage)) {
+ unmap_success = try_to_unmap(hpage, ttu);
+ } else {
+ /*
+ * For hugetlb pages, try_to_unmap could potentially call
+ * huge_pmd_unshare. Because of this, take semaphore in
+ * write mode here and set TTU_RMAP_LOCKED to indicate we
+ * have taken the lock at this higher level.
+ */
+ i_mmap_lock_write(mapping);
+ unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ }
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
diff --git a/mm/migrate.c b/mm/migrate.c
index 84381b55b2bd..725edaef238a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1307,8 +1307,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
+ struct address_space *mapping = page_mapping(hpage);
+
+ /*
+ * try_to_unmap could potentially call huge_pmd_unshare.
+ * Because of this, take semaphore in write mode here and
+ * set TTU_RMAP_LOCKED to let lower levels know we have
+ * taken the lock.
+ */
+ i_mmap_lock_write(mapping);
try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
+ TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
page_was_mapped = 1;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 85b7f9423352..c566bd552535 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,6 +25,7 @@
* page->flags PG_locked (lock_page)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1374,6 +1375,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
+ *
+ * If called for a huge page, caller must hold i_mmap_rwsem
+ * in write mode as it is possible to call huge_pmd_unshare.
*/
adjust_range_if_pmd_sharing_possible(vma, &start, &end);
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda96f20..48368589f519 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,10 +267,14 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
- * Serialize via hugetlb_fault_mutex
+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
+ * i_mmap_rwsem ensures the dst_pte remains valid even
+ * in the case of shared pmds. fault mutex prevents
+ * races with other faulting threads.
*/
- idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
+ i_mmap_lock_read(mapping);
+ idx = linear_page_index(dst_vma, dst_addr);
hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -279,6 +283,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -286,6 +291,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
dst_pteval = huge_ptep_get(dst_pte);
if (!huge_pte_none(dst_pteval)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -293,6 +299,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
dst_addr, src_addr, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;
cond_resched();
--
2.17.2
The xfstests generic/475 test switches the underlying device with
dm-error while running a stress test. This results in a large number of
file system errors, and since we can't lock the buffer head when marking
the superblock dirty in the ext4_grp_locked_error() case, it's possible
for the superblock to be !buffer_uptodate() without
buffer_write_io_error() being true.
We need to set buffer_uptodate() before we call mark_buffer_dirty() or
this will trigger a WARN_ON. It's safe to do this since the superblock
must have been properly read into memory or the mount would not have been
successful. So if buffer_uptodate() is not set, we can safely assume that
this happened due to a failed attempt to write the superblock.
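Roughly, and assuming the clear/set calls that already live in that branch
of ext4_commit_super(), the resulting logic is:

	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
		/* a previous attempt to write the superblock failed */
		ext4_msg(sb, KERN_ERR,
			 "previous I/O error to superblock detected");
		clear_buffer_write_io_error(sbh);
		set_buffer_uptodate(sbh);	/* safe: sb was read at mount time */
	}
	...
	mark_buffer_dirty(sbh);	/* would WARN on a !uptodate buffer otherwise */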
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Cc: stable(a)vger.kernel.org
---
fs/ext4/super.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d6c142d73d99..fb12d3c17c1b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4902,7 +4902,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
ext4_superblock_csum_set(sb);
if (sync)
lock_buffer(sbh);
- if (buffer_write_io_error(sbh)) {
+ if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
* superblock failed. This could happen because the
--
2.19.1
Delay the drm_modeset_acquire_init() until after we check for an
allocation failure so that we can return immediately upon error without
having to unwind.
WARNING: lock held when returning to user space!
4.20.0+ #174 Not tainted
------------------------------------------------
syz-executor556/8153 is leaving the kernel with locks still held!
1 lock held by syz-executor556/8153:
#0: 000000005100c85c (crtc_ww_class_acquire){+.+.}, at:
set_property_atomic+0xb3/0x330 drivers/gpu/drm/drm_mode_object.c:462
Reported-by: syzbot+6ea337c427f5083ebdf2(a)syzkaller.appspotmail.com
Fixes: 144a7999d633 ("drm: Handle properties in the core for atomic drivers")
Signed-off-by: Chris Wilson <chris(a)chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter(a)ffwll.ch>
Cc: Maarten Lankhorst <maarten.lankhorst(a)linux.intel.com>
Cc: Sean Paul <sean(a)poorly.run>
Cc: David Airlie <airlied(a)linux.ie>
Cc: <stable(a)vger.kernel.org> # v4.14+
---
drivers/gpu/drm/drm_mode_object.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/drm_mode_object.c b/drivers/gpu/drm/drm_mode_object.c
index bb1dd46496cd..a9005c1c2384 100644
--- a/drivers/gpu/drm/drm_mode_object.c
+++ b/drivers/gpu/drm/drm_mode_object.c
@@ -459,12 +459,13 @@ static int set_property_atomic(struct drm_mode_object *obj,
struct drm_modeset_acquire_ctx ctx;
int ret;
- drm_modeset_acquire_init(&ctx, 0);
-
state = drm_atomic_state_alloc(dev);
if (!state)
return -ENOMEM;
+
+ drm_modeset_acquire_init(&ctx, 0);
state->acquire_ctx = &ctx;
+
retry:
if (prop == state->dev->mode_config.dpms_property) {
if (obj->type != DRM_MODE_OBJECT_CONNECTOR) {
--
2.20.1
Commit 33a48ab105a7 ("ibmveth: Fix DMA unmap error") fixed an issue in the
normal code path of ibmveth_start_xmit() that was originally introduced by
commit 6e8ab30ec677 ("ibmveth: Add scatter-gather support"). That original
fix missed the error path, where dma_unmap_page() is wrongly called on the
header portion in descs[0], which was mapped with dma_map_single(). As a
result, a failure to DMA map any of the frags results in a dmesg warning
when CONFIG_DMA_API_DEBUG is enabled.
------------[ cut here ]------------
DMA-API: ibmveth 30000002: device driver frees DMA memory with wrong function
[device address=0x000000000a430000] [size=172 bytes] [mapped as page] [unmapped as single]
WARNING: CPU: 1 PID: 8426 at kernel/dma/debug.c:1085 check_unmap+0x4fc/0xe10
...
<snip>
...
DMA-API: Mapped at:
ibmveth_start_xmit+0x30c/0xb60
dev_hard_start_xmit+0x100/0x450
sch_direct_xmit+0x224/0x490
__qdisc_run+0x20c/0x980
__dev_queue_xmit+0x1bc/0xf20
This fixes the API misuse by unmapping descs[0] with dma_unmap_single().
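In DMA-API terms the rule being restored is that an unmap must mirror the
call that created the mapping; an illustrative pairing (not the driver's
exact code):

	/* linear buffer (e.g. the descs[0] header): */
	addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	...
	dma_unmap_single(dev, addr, len, DMA_TO_DEVICE);

	/* page fragment: */
	addr = dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
	...
	dma_unmap_page(dev, addr, len, DMA_TO_DEVICE);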
Fixes: 6e8ab30ec677 ("ibmveth: Add scatter-gather support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Tyrel Datwyler <tyreld(a)linux.vnet.ibm.com>
---
drivers/net/ethernet/ibm/ibmveth.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index a468178..098d876 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -1171,11 +1171,15 @@ static netdev_tx_t ibmveth_start_xmit(struct sk_buff *skb,
map_failed_frags:
last = i+1;
- for (i = 0; i < last; i++)
+ for (i = 1; i < last; i++)
dma_unmap_page(&adapter->vdev->dev, descs[i].fields.address,
descs[i].fields.flags_len & IBMVETH_BUF_LEN_MASK,
DMA_TO_DEVICE);
+ dma_unmap_single(&adapter->vdev->dev,
+ descs[0].fields.address,
+ descs[0].fields.flags_len & IBMVETH_BUF_LEN_MASK,
+ DMA_TO_DEVICE);
map_failed:
if (!firmware_has_feature(FW_FEATURE_CMO))
netdev_err(netdev, "tx: unable to map xmit buffer\n");
--
2.7.4
The patch titled
Subject: lib/gen_crc64table.c: don't depend on linux headers being installed
has been added to the -mm tree. Its filename is
lib-dont-depend-on-linux-headers-being-installed.patch
This patch should soon appear at
http://ozlabs.org/~akpm/mmots/broken-out/lib-dont-depend-on-linux-headers-b…
and later at
http://ozlabs.org/~akpm/mmotm/broken-out/lib-dont-depend-on-linux-headers-b…
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next and is updated
there every 3-4 working days
------------------------------------------------------
From: NeilBrown <neil(a)brown.name>
Subject: lib/gen_crc64table.c: don't depend on linux headers being installed
gen_crc64table requires Linux include files to be installed in
/usr/include/linux. This is a new requirement, so hosts that could
previously build the kernel now cannot.
gen_crc64table imposes this requirement by including <linux/swab.h>, but
nothing from that header is actually used.
So remove the #include, so that the linux headers no longer need to be
installed.
Link: http://lkml.kernel.org/r/87y3899gzi.fsf@notabene.neil.brown.name
Fixes: feba04fd2cf8 ("lib: add crc64 calculation routines")
Signed-off-by: NeilBrown <neil(a)brown.name>
Acked-by: Coly Li <colyli(a)suse.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
--- a/lib/gen_crc64table.c~lib-dont-depend-on-linux-headers-being-installed
+++ a/lib/gen_crc64table.c
@@ -16,8 +16,6 @@
#include <inttypes.h>
#include <stdio.h>
-#include <linux/swab.h>
-
#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL
static uint64_t crc64_table[256] = {0};
_
Patches currently in -mm which might be from neil(a)brown.name are
lib-dont-depend-on-linux-headers-being-installed.patch