From: Zi Yan <ziy(a)nvidia.com>
In isolate_migratepages_block(), when cc->alloc_contig is true we are
able to isolate compound pages, but nr_migratepages and nr_isolated did
not count compound pages correctly, causing us to isolate more pages
than intended. Count compound pages as the number of base pages they
contain. Otherwise we might get trapped in the too_many_isolated()
while loop: since we stop isolation only after cc->nr_migratepages
reaches COMPACT_CLUSTER_MAX (32), the number of base pages actually
isolated can go up to COMPACT_CLUSTER_MAX * 512 = 16384.
In addition, after the fix above, cc->nr_migratepages might never be
exactly equal to COMPACT_CLUSTER_MAX when compound pages are isolated,
so page isolation could fail to stop where we intended. Change the
isolation stop condition to >=.
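To make the numbers concrete, here is a small userspace sketch (not
part of the patch; COMPACT_CLUSTER_MAX and HPAGE_PMD_NR are reused only
for illustration, and 512 assumes 2MB THPs on 4KB base pages):

#include <stdio.h>

#define COMPACT_CLUSTER_MAX	32
#define HPAGE_PMD_NR		512	/* base pages per 2MB THP, 4KB pages */

int main(void)
{
	/*
	 * Old counting: every isolated page, THP or not, bumped
	 * nr_migratepages by 1, so isolation only stopped after 32 list
	 * entries, i.e. up to 32 * 512 = 16384 base pages off the LRU.
	 */
	printf("worst case with old counting: %d base pages\n",
	       COMPACT_CLUSTER_MAX * HPAGE_PMD_NR);

	/*
	 * New counting: one THP adds 512 at once, so nr_migratepages can
	 * jump past 32 without ever being equal to it, hence ">=".
	 */
	int nr_migratepages = HPAGE_PMD_NR;	/* one THP isolated */

	if (nr_migratepages >= COMPACT_CLUSTER_MAX)
		printf("'>=' stop condition triggers at %d\n", nr_migratepages);
	if (nr_migratepages != COMPACT_CLUSTER_MAX)
		printf("'==' stop condition would never trigger here\n");
	return 0;
}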
The issue can be triggered as follows:
In a system with 16GB memory and an 8GB CMA region reserved by
hugetlb_cma, if we first allocate 10GB of THPs and mlock them
(so some THPs are allocated in the CMA region and mlocked), then
reserving 6 1GB hugetlb pages via
/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages gets stuck
(looping in the too_many_isolated() function) until one of the two
tasks is killed.
With the patch applied, the OOM killer kills the application holding
the 10GB of THPs and lets the hugetlb page reservation finish.
Fixes: 1da2f328fa64 ("mm,thp,compaction,cma: allow THP migration for CMA allocations")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
---
mm/compaction.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index ee1f8439369e..3e834ac402f1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1012,8 +1012,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
isolate_success:
list_add(&page->lru, &cc->migratepages);
- cc->nr_migratepages++;
- nr_isolated++;
+ cc->nr_migratepages += compound_nr(page);
+ nr_isolated += compound_nr(page);
/*
* Avoid isolating too much unless this block is being
@@ -1021,7 +1021,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* or a lock is contended. For contention, isolate quickly to
* potentially remove one source of contention.
*/
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX &&
!cc->rescan && !cc->contended) {
++low_pfn;
break;
@@ -1132,7 +1132,7 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
if (!pfn)
break;
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
+ if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX)
break;
}
--
2.28.0
We've fixed many races in panfrost_job_timedout(), but some remain.
Instead of trying to fix them again, let's simplify the logic and move
the reset bits to a separate work item scheduled when one of the queues
reports a timeout.
v2:
- Use atomic_cmpxchg() to conditionally schedule the reset work (Steven Price)
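For context, a toy userspace model of that v2 rule (C11 atomics stand
in for the kernel's atomic_cmpxchg()/schedule_work(); the names below
are made up for illustration, not the driver's):

#include <stdatomic.h>
#include <stdio.h>

/* Only the caller that flips 'pending' from 0 to 1 queues the reset. */
static atomic_int reset_pending;

static void maybe_schedule_reset(const char *who)
{
	int expected = 0;

	/* Mirrors: if (!atomic_cmpxchg(&pfdev->reset.pending, 0, 1)) ... */
	if (atomic_compare_exchange_strong(&reset_pending, &expected, 1))
		printf("%s: reset work scheduled\n", who);
	else
		printf("%s: reset already pending, skip\n", who);
}

int main(void)
{
	maybe_schedule_reset("queue 0 timeout");	/* schedules the reset */
	maybe_schedule_reset("queue 1 timeout");	/* no-op, already pending */

	/* The reset handler clears the flag once all timers are stopped. */
	atomic_store(&reset_pending, 0);
	maybe_schedule_reset("later timeout");		/* can schedule again */
	return 0;
}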
Fixes: 1a11a88cfd9a ("drm/panfrost: Fix job timeout handling")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Boris Brezillon <boris.brezillon(a)collabora.com>
---
drivers/gpu/drm/panfrost/panfrost_device.c | 1 -
drivers/gpu/drm/panfrost/panfrost_device.h | 6 +-
drivers/gpu/drm/panfrost/panfrost_job.c | 127 ++++++++++++---------
3 files changed, 79 insertions(+), 55 deletions(-)
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.c b/drivers/gpu/drm/panfrost/panfrost_device.c
index ea8d31863c50..a83b2ff5837a 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.c
+++ b/drivers/gpu/drm/panfrost/panfrost_device.c
@@ -200,7 +200,6 @@ int panfrost_device_init(struct panfrost_device *pfdev)
struct resource *res;
mutex_init(&pfdev->sched_lock);
- mutex_init(&pfdev->reset_lock);
INIT_LIST_HEAD(&pfdev->scheduled_jobs);
INIT_LIST_HEAD(&pfdev->as_lru_list);
diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h
index 140e004a3790..597cf1459b0a 100644
--- a/drivers/gpu/drm/panfrost/panfrost_device.h
+++ b/drivers/gpu/drm/panfrost/panfrost_device.h
@@ -106,7 +106,11 @@ struct panfrost_device {
struct panfrost_perfcnt *perfcnt;
struct mutex sched_lock;
- struct mutex reset_lock;
+
+ struct {
+ struct work_struct work;
+ atomic_t pending;
+ } reset;
struct mutex shrinker_lock;
struct list_head shrinker_list;
diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 4902bc6624c8..14c11293791e 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -20,6 +20,8 @@
#include "panfrost_gpu.h"
#include "panfrost_mmu.h"
+#define JOB_TIMEOUT_MS 500
+
#define job_write(dev, reg, data) writel(data, dev->iomem + (reg))
#define job_read(dev, reg) readl(dev->iomem + (reg))
@@ -382,19 +384,37 @@ static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
drm_sched_increase_karma(bad);
queue->stopped = true;
stopped = true;
+
+ /*
+ * Set the timeout to max so the timer doesn't get started
+ * when we return from the timeout handler (restored in
+ * panfrost_scheduler_start()).
+ */
+ queue->sched.timeout = MAX_SCHEDULE_TIMEOUT;
}
mutex_unlock(&queue->lock);
return stopped;
}
+static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
+{
+ if (WARN_ON(!queue->stopped))
+ return;
+
+ mutex_lock(&queue->lock);
+ /* Restore the original timeout before starting the scheduler. */
+ queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
+ drm_sched_start(&queue->sched, true);
+ queue->stopped = false;
+ mutex_unlock(&queue->lock);
+}
+
static void panfrost_job_timedout(struct drm_sched_job *sched_job)
{
struct panfrost_job *job = to_panfrost_job(sched_job);
struct panfrost_device *pfdev = job->pfdev;
int js = panfrost_job_get_slot(job);
- unsigned long flags;
- int i;
/*
* If the GPU managed to complete this jobs fence, the timeout is
@@ -415,56 +435,9 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
return;
- if (!mutex_trylock(&pfdev->reset_lock))
- return;
-
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
- struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;
-
- /*
- * If the queue is still active, make sure we wait for any
- * pending timeouts.
- */
- if (!pfdev->js->queue[i].stopped)
- cancel_delayed_work_sync(&sched->work_tdr);
-
- /*
- * If the scheduler was not already stopped, there's a tiny
- * chance a timeout has expired just before we stopped it, and
- * drm_sched_stop() does not flush pending works. Let's flush
- * them now so the timeout handler doesn't get called in the
- * middle of a reset.
- */
- if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
- cancel_delayed_work_sync(&sched->work_tdr);
-
- /*
- * Now that we cancelled the pending timeouts, we can safely
- * reset the stopped state.
- */
- pfdev->js->queue[i].stopped = false;
- }
-
- spin_lock_irqsave(&pfdev->js->job_lock, flags);
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
- if (pfdev->jobs[i]) {
- pm_runtime_put_noidle(pfdev->dev);
- panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
- pfdev->jobs[i] = NULL;
- }
- }
- spin_unlock_irqrestore(&pfdev->js->job_lock, flags);
-
- panfrost_device_reset(pfdev);
-
- for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
-
- mutex_unlock(&pfdev->reset_lock);
-
- /* restart scheduler after GPU is usable again */
- for (i = 0; i < NUM_JOB_SLOTS; i++)
- drm_sched_start(&pfdev->js->queue[i].sched, true);
+ /* Schedule a reset if there's no reset in progress. */
+ if (!atomic_cmpxchg(&pfdev->reset.pending, 0, 1))
+ schedule_work(&pfdev->reset.work);
}
static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -531,11 +504,59 @@ static irqreturn_t panfrost_job_irq_handler(int irq, void *data)
return IRQ_HANDLED;
}
+static void panfrost_reset(struct work_struct *work)
+{
+ struct panfrost_device *pfdev = container_of(work,
+ struct panfrost_device,
+ reset.work);
+ unsigned long flags;
+ unsigned int i;
+
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ /*
+ * We want pending timeouts to be handled before we attempt
+ * to stop the scheduler. If we don't do that and the timeout
+ * handler is in flight, it might have removed the bad job
+ * from the list, and we'll lose this job if the reset handler
+ * enters the critical section in panfrost_scheduler_stop()
+ * before the timeout handler.
+ *
+ * Timeout is set to max to make sure the timer is not
+ * restarted after the cancellation.
+ */
+ pfdev->js->queue[i].sched.timeout = MAX_SCHEDULE_TIMEOUT;
+ cancel_delayed_work_sync(&pfdev->js->queue[i].sched.work_tdr);
+ panfrost_scheduler_stop(&pfdev->js->queue[i], NULL);
+ }
+
+ /* All timers have been stopped, we can safely reset the pending state. */
+ atomic_set(&pfdev->reset.pending, 0);
+
+ spin_lock_irqsave(&pfdev->js->job_lock, flags);
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ if (pfdev->jobs[i]) {
+ pm_runtime_put_noidle(pfdev->dev);
+ panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
+ pfdev->jobs[i] = NULL;
+ }
+ }
+ spin_unlock_irqrestore(&pfdev->js->job_lock, flags);
+
+ panfrost_device_reset(pfdev);
+
+ for (i = 0; i < NUM_JOB_SLOTS; i++) {
+ drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
+ panfrost_scheduler_start(&pfdev->js->queue[i]);
+ }
+}
+
int panfrost_job_init(struct panfrost_device *pfdev)
{
struct panfrost_job_slot *js;
int ret, j, irq;
+ INIT_WORK(&pfdev->reset.work, panfrost_reset);
+
pfdev->js = js = devm_kzalloc(pfdev->dev, sizeof(*js), GFP_KERNEL);
if (!js)
return -ENOMEM;
@@ -560,7 +581,7 @@ int panfrost_job_init(struct panfrost_device *pfdev)
ret = drm_sched_init(&js->queue[j].sched,
&panfrost_sched_ops,
- 1, 0, msecs_to_jiffies(500),
+ 1, 0, msecs_to_jiffies(JOB_TIMEOUT_MS),
"pan_js");
if (ret) {
dev_err(pfdev->dev, "Failed to create scheduler: %d.", ret);
--
2.26.2
Commit 393f203f5fd5 ("x86_64: kasan: add interceptors for
memset/memmove/memcpy functions") added .weak directives to
arch/x86/lib/mem*_64.S instead of changing the existing SYM_FUNC_START_*
macros. This can lead to the assembly snippet `.weak memcpy ... .globl
memcpy`, which produces a STB_WEAK memcpy with GNU as but a STB_GLOBAL
memcpy with LLVM's integrated assembler before LLVM 12. LLVM 12 (since
https://reviews.llvm.org/D90108) errors on such an overridden symbol
binding.
Use the appropriate SYM_FUNC_START_WEAK instead.
Fixes: 393f203f5fd5 ("x86_64: kasan: add interceptors for memset/memmove/memcpy functions")
Reported-by: Sami Tolvanen <samitolvanen(a)google.com>
Signed-off-by: Fangrui Song <maskray(a)google.com>
Cc: <stable(a)vger.kernel.org>
---
arch/x86/lib/memcpy_64.S | 4 +---
arch/x86/lib/memmove_64.S | 4 +---
arch/x86/lib/memset_64.S | 4 +---
3 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 037faac46b0c..1e299ac73c86 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -16,8 +16,6 @@
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
-.weak memcpy
-
/*
* memcpy - Copy a memory block.
*
@@ -30,7 +28,7 @@
* rax original destination
*/
SYM_FUNC_START_ALIAS(__memcpy)
-SYM_FUNC_START_LOCAL(memcpy)
+SYM_FUNC_START_WEAK(memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memcpy_erms", X86_FEATURE_ERMS
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 7ff00ea64e4f..41902fe8b859 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,9 +24,7 @@
* Output:
* rax: dest
*/
-.weak memmove
-
-SYM_FUNC_START_ALIAS(memmove)
+SYM_FUNC_START_WEAK(memmove)
SYM_FUNC_START(__memmove)
mov %rdi, %rax
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9ff15ee404a4..0bfd26e4ca9e 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -6,8 +6,6 @@
#include <asm/alternative-asm.h>
#include <asm/export.h>
-.weak memset
-
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
@@ -19,7 +17,7 @@
*
* rax original destination
*/
-SYM_FUNC_START_ALIAS(memset)
+SYM_FUNC_START_WEAK(memset)
SYM_FUNC_START(__memset)
/*
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
--
2.29.1.341.ge80a0c044ae-goog
A: Because it messes up the order in which people normally read text.
Q: Why is top-posting such a bad thing?
A: Top-posting.
Q: What is the most annoying thing in e-mail?
A: No.
Q: Should I include quotations after my reply?
http://daringfireball.net/2007/07/on_top
On Thu, Oct 29, 2020 at 08:23:57AM -0700, Chris Ye wrote:
> Haha, thanks!
> I have fixed my git config and sent a new mail, can you check it?
Do you have a pointer to it on lore.kernel.org?
And you did properly version the patch, right?
thanks,
greg k-h
Backport some fscrypt fixes from upstream 5.2 to 4.19-stable.
This is needed to get 'kvm-xfstests -c ext4,f2fs,ubifs -g encrypt' to
fully pass on 4.19-stable. Before, generic/397 and generic/429 failed
on UBIFS due to missing "fscrypt: fix race where ->lookup() marks
plaintext dentry as ciphertext".
This also fixes some bugs that aren't yet covered by the xfstests.
E.g., "fs, fscrypt: clear DCACHE_ENCRYPTED_NAME when unaliasing
directory" fixes a bug that caused real-world problems on Chrome OS.
Some relatively straightforward adjustments were needed to the patches,
mainly due to the refactoring of fscrypt.h that was done in 5.1.
Eric Biggers (5):
fscrypt: clean up and improve dentry revalidation
fscrypt: fix race allowing rename() and link() of ciphertext dentries
fs, fscrypt: clear DCACHE_ENCRYPTED_NAME when unaliasing directory
fscrypt: only set dentry_operations on ciphertext dentries
fscrypt: fix race where ->lookup() marks plaintext dentry as
ciphertext
fs/crypto/crypto.c | 58 +++++++++++++------------
fs/crypto/fname.c | 1 +
fs/crypto/hooks.c | 28 ++++++++----
fs/dcache.c | 15 +++++++
fs/ext4/ext4.h | 62 ++++++++++++++++++++-------
fs/ext4/namei.c | 76 ++++++++++++++++++++++-----------
fs/f2fs/namei.c | 17 +++++---
fs/ubifs/dir.c | 8 ++--
include/linux/dcache.h | 2 +-
include/linux/fscrypt.h | 30 +++++++------
include/linux/fscrypt_notsupp.h | 9 ++--
include/linux/fscrypt_supp.h | 6 ++-
12 files changed, 205 insertions(+), 107 deletions(-)
--
2.29.1
Backport some fscrypt fixes from 4.10 and 4.11 to 4.9-stable.
These will be needed for xfstest generic/395 to pass if
https://lkml.kernel.org/fstests/20201031054018.695314-1-ebiggers@kernel.org
is applied.
These are clean cherry picks.
Tested with 'kvm-xfstests -c ext4,f2fs -g encrypt'.
Eric Biggers (2):
fscrypto: move ioctl processing more fully into common code
fscrypt: use EEXIST when file already uses different policy
fs/crypto/policy.c | 36 ++++++++++++++++++++++--------------
fs/ext4/ext4.h | 4 ++--
fs/ext4/ioctl.c | 34 +++++-----------------------------
fs/f2fs/f2fs.h | 4 ++--
fs/f2fs/file.c | 19 ++-----------------
include/linux/fscrypto.h | 12 ++++++------
6 files changed, 39 insertions(+), 70 deletions(-)
--
2.29.1
From: Chao Yu <yuchao0(a)huawei.com>
commit e06f86e61d7a67fe6e826010f57aa39c674f4b1b upstream.
[This backport fixes a regression in 4.4-stable caused by commit
11a6e8f89521 ("f2fs: check memory boundary by insane namelen"), which
depended on this missing commit. This bad backport broke f2fs
encryption because it moved the incrementing of 'bit_pos' to earlier in
f2fs_fill_dentries() without accounting for it being used in the
encrypted dir case. This caused readdir() on encrypted directories to
start failing. Tested with 'kvm-xfstests -c f2fs -g encrypt'.]
When decrypting dirents in ->readdir, fscrypt_fname_disk_to_usr() does
not modify the content of the original encrypted dirent, so we don't
need to allocate an additional buffer to hold a copy of it; get rid of
the buffer.
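For illustration only, a hypothetical userspace sketch of why the copy
is unnecessary: the disk-to-user conversion treats its input as
read-only, so the caller can point straight at the on-disk name (none
of these names are the real f2fs/fscrypt API):

#include <stdio.h>
#include <string.h>

/* Toy stand-in: the disk-to-user conversion never writes to its input. */
struct disk_str { const char *name; unsigned int len; };

static int fname_disk_to_usr(const struct disk_str *disk_name, char *out,
			     unsigned int outlen)
{
	unsigned int n = disk_name->len < outlen - 1 ? disk_name->len
						     : outlen - 1;

	memcpy(out, disk_name->name, n);	/* pretend decryption into out */
	out[n] = '\0';
	return 0;
}

int main(void)
{
	char dentry_block[] = "on-disk-name-bytes";	/* pretend dirent data */
	struct disk_str de_name = { dentry_block,
				    (unsigned int)strlen(dentry_block) };
	char user_name[64];

	/* No kmalloc()+memcpy() private copy of de_name.name is needed. */
	fname_disk_to_usr(&de_name, user_name, sizeof(user_name));
	printf("%s\n", user_name);
	return 0;
}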
Signed-off-by: Chao Yu <yuchao0(a)huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
fs/f2fs/dir.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index e2ff0eb16f89c..c1130914d6ed7 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -820,15 +820,8 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
int save_len = fstr->len;
int ret;
- de_name.name = kmalloc(de_name.len, GFP_NOFS);
- if (!de_name.name)
- return false;
-
- memcpy(de_name.name, d->filename[bit_pos], de_name.len);
-
ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
&de_name, fstr);
- kfree(de_name.name);
if (ret < 0)
return true;
--
2.29.1