The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 41bd60676923822de1df2c50b3f9a10171f4338a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Wed, 28 Nov 2018 14:54:28 +0000
Subject: [PATCH] Btrfs: fix fsync of files with multiple hard links in new
directories
The log tree has a long standing problem that when a file is fsync'ed we
only check for new ancestors, created in the current transaction, by
following only the hard link for which the fsync was issued. We follow the
ancestors using the VFS' dget_parent() API. This means that if we create a
new link for a file in a directory that is new (or in an any other new
ancestor directory) and then fsync the file using an old hard link, we end
up not logging the new ancestor, and on log replay that new hard link and
ancestor do not exist. In some cases, involving renames, the file will not
exist at all.
Example:
mkfs.btrfs -f /dev/sdb
mount /dev/sdb /mnt
mkdir /mnt/A
touch /mnt/foo
ln /mnt/foo /mnt/A/bar
xfs_io -c fsync /mnt/foo
<power failure>
In this example after log replay only the hard link named 'foo' exists
and directory A does not exist, which is unexpected. In other major linux
filesystems, such as ext4, xfs and f2fs for example, both hard links exist
and so does directory A after mounting again the filesystem.
Checking if any new ancestors are new and need to be logged was added in
2009 by commit 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes"),
however only for the ancestors of the hard link (dentry) for which the
fsync was issued, instead of checking for all ancestors for all of the
inode's hard links.
So fix this by tracking the id of the last transaction where a hard link
was created for an inode and then on fsync fallback to a full transaction
commit when an inode has more than one hard link and at least one new hard
link was created in the current transaction. This is the simplest solution
since this is not a common use case (adding frequently hard links for
which there's an ancestor created in the current transaction and then
fsync the file). In case it ever becomes a common use case, a solution
that consists of iterating the fs/subvol btree for each hard link and
check if any ancestor is new, could be implemented.
This solves many unexpected scenarios reported by Jayashree Mohan and
Vijay Chidambaram, and for which there is a new test case for fstests
under review.
Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
CC: stable(a)vger.kernel.org # 4.4+
Reported-by: Vijay Chidambaram <vvijay03(a)gmail.com>
Reported-by: Jayashree Mohan <jayashree2912(a)gmail.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fc25607304f2..6f5d07415dab 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,6 +147,12 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
+ /*
+ * Track the transaction id of the last transaction used to create a
+ * hard link for the inode. This is used by the log tree (fsync).
+ */
+ u64 last_link_trans;
+
/*
* Number of bytes outstanding that are going to need csums. This is
* used in ENOSPC accounting.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d54bdef16d8d..b4129d9072ec 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3658,6 +3658,21 @@ static int btrfs_read_locked_inode(struct inode *inode,
* inode is not a directory, logging its parent unnecessarily.
*/
BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ /*
+ * Similar reasoning for last_link_trans, needs to be set otherwise
+ * for a case like the following:
+ *
+ * mkdir A
+ * touch foo
+ * ln foo A/bar
+ * echo 2 > /proc/sys/vm/drop_caches
+ * fsync foo
+ * <power failure>
+ *
+ * Would result in link bar and directory A not existing after the power
+ * failure.
+ */
+ BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
path->slots[0]++;
if (inode->i_nlink != 1 ||
@@ -6597,6 +6612,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
goto fail;
}
+ BTRFS_I(inode)->last_link_trans = trans->transid;
d_instantiate(dentry, inode);
ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
true, NULL);
@@ -9123,6 +9139,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->dir_index = 0;
ei->last_unlink_trans = 0;
+ ei->last_link_trans = 0;
ei->last_log_commit = 0;
spin_lock_init(&ei->lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 013d0abcd46b..5baad9bebc62 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5758,6 +5758,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
+ /*
+ * If a new hard link was added to the inode in the current transaction
+ * and its link count is now greater than 1, we need to fallback to a
+ * transaction commit, otherwise we can end up not logging all its new
+ * parents for all the hard links. Here just from the dentry used to
+ * fsync, we can not visit the ancestor inodes for all the other hard
+ * links to figure out if any is new, so we fallback to a transaction
+ * commit (instead of adding a lot of complexity of scanning a btree,
+ * since this scenario is not a common use case).
+ */
+ if (inode->vfs_inode.i_nlink > 1 &&
+ inode->last_link_trans > last_committed) {
+ ret = -EMLINK;
+ goto end_trans;
+ }
+
while (1) {
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
The patch below does not apply to the 4.20-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9349e23907be1954ccdf6d771d640e2788da1643 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv(a)altlinux.org>
Date: Thu, 1 Nov 2018 14:03:08 +0300
Subject: [PATCH] uapi: fix linux/kfd_ioctl.h userspace compilation errors
Consistently use types provided by <linux/types.h> via <drm/drm.h>
to fix the following linux/kfd_ioctl.h userspace compilation errors:
/usr/include/linux/kfd_ioctl.h:250:2: error: unknown type name 'uint32_t'
uint32_t reset_type;
/usr/include/linux/kfd_ioctl.h:251:2: error: unknown type name 'uint32_t'
uint32_t reset_cause;
/usr/include/linux/kfd_ioctl.h:252:2: error: unknown type name 'uint32_t'
uint32_t memory_lost;
/usr/include/linux/kfd_ioctl.h:253:2: error: unknown type name 'uint32_t'
uint32_t gpu_id;
Fixes: 0c119abad7f0d ("drm/amd: Add kfd ioctl defines for hw_exception event")
Cc: <stable(a)vger.kernel.org> # v4.19
Signed-off-by: Dmitry V. Levin <ldv(a)altlinux.org>
Reviewed-by: Felix Kuehling <Felix.Kuehling(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index f5ff8a76e208..dae897f38e59 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -255,10 +255,10 @@ struct kfd_hsa_memory_exception_data {
/* hw exception data */
struct kfd_hsa_hw_exception_data {
- uint32_t reset_type;
- uint32_t reset_cause;
- uint32_t memory_lost;
- uint32_t gpu_id;
+ __u32 reset_type;
+ __u32 reset_cause;
+ __u32 memory_lost;
+ __u32 gpu_id;
};
/* Event data */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 9349e23907be1954ccdf6d771d640e2788da1643 Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv(a)altlinux.org>
Date: Thu, 1 Nov 2018 14:03:08 +0300
Subject: [PATCH] uapi: fix linux/kfd_ioctl.h userspace compilation errors
Consistently use types provided by <linux/types.h> via <drm/drm.h>
to fix the following linux/kfd_ioctl.h userspace compilation errors:
/usr/include/linux/kfd_ioctl.h:250:2: error: unknown type name 'uint32_t'
uint32_t reset_type;
/usr/include/linux/kfd_ioctl.h:251:2: error: unknown type name 'uint32_t'
uint32_t reset_cause;
/usr/include/linux/kfd_ioctl.h:252:2: error: unknown type name 'uint32_t'
uint32_t memory_lost;
/usr/include/linux/kfd_ioctl.h:253:2: error: unknown type name 'uint32_t'
uint32_t gpu_id;
Fixes: 0c119abad7f0d ("drm/amd: Add kfd ioctl defines for hw_exception event")
Cc: <stable(a)vger.kernel.org> # v4.19
Signed-off-by: Dmitry V. Levin <ldv(a)altlinux.org>
Reviewed-by: Felix Kuehling <Felix.Kuehling(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index f5ff8a76e208..dae897f38e59 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -255,10 +255,10 @@ struct kfd_hsa_memory_exception_data {
/* hw exception data */
struct kfd_hsa_hw_exception_data {
- uint32_t reset_type;
- uint32_t reset_cause;
- uint32_t memory_lost;
- uint32_t gpu_id;
+ __u32 reset_type;
+ __u32 reset_cause;
+ __u32 memory_lost;
+ __u32 gpu_id;
};
/* Event data */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2cd4bd192ee94848695c1c052d87913260e10f36 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram(a)us.ibm.com>
Date: Thu, 20 Dec 2018 12:03:30 -0800
Subject: [PATCH] powerpc/pkeys: Fix handling of pkey state across fork()
Protection key tracking information is not copied over to the
mm_struct of the child during fork(). This can cause the child to
erroneously allocate keys that were already allocated. Any allocated
execute-only key is lost aswell.
Add code; called by dup_mmap(), to copy the pkey state from parent to
child explicitly.
This problem was originally found by Dave Hansen on x86, which turns
out to be a problem on powerpc aswell.
Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem")
Cc: stable(a)vger.kernel.org # v4.16+
Reviewed-by: Thiago Jung Bauermann <bauerman(a)linux.ibm.com>
Signed-off-by: Ram Pai <linuxram(a)us.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index c05efd2e8736..e687ed31d85a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -217,12 +217,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
#endif
}
-static inline int arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
-{
- return 0;
-}
-
#ifdef CONFIG_PPC_BOOK3E_64
static inline void arch_exit_mmap(struct mm_struct *mm)
{
@@ -247,6 +241,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
#ifdef CONFIG_PPC_MEM_KEYS
bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
bool execute, bool foreign);
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm);
#else /* CONFIG_PPC_MEM_KEYS */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
@@ -259,6 +254,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
#define thread_pkey_regs_save(thread)
#define thread_pkey_regs_restore(new_thread, old_thread)
#define thread_pkey_regs_init(thread)
+#define arch_dup_pkeys(oldmm, mm)
static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
{
@@ -267,5 +263,12 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
#endif /* CONFIG_PPC_MEM_KEYS */
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+ arch_dup_pkeys(oldmm, mm);
+ return 0;
+}
+
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 04b60a8f6e69..587807763737 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -415,3 +415,13 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
return pkey_access_permitted(vma_pkey(vma), write, execute);
}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
The patch below does not apply to the 4.20-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 2cd4bd192ee94848695c1c052d87913260e10f36 Mon Sep 17 00:00:00 2001
From: Ram Pai <linuxram(a)us.ibm.com>
Date: Thu, 20 Dec 2018 12:03:30 -0800
Subject: [PATCH] powerpc/pkeys: Fix handling of pkey state across fork()
Protection key tracking information is not copied over to the
mm_struct of the child during fork(). This can cause the child to
erroneously allocate keys that were already allocated. Any allocated
execute-only key is lost aswell.
Add code; called by dup_mmap(), to copy the pkey state from parent to
child explicitly.
This problem was originally found by Dave Hansen on x86, which turns
out to be a problem on powerpc aswell.
Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem")
Cc: stable(a)vger.kernel.org # v4.16+
Reviewed-by: Thiago Jung Bauermann <bauerman(a)linux.ibm.com>
Signed-off-by: Ram Pai <linuxram(a)us.ibm.com>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index c05efd2e8736..e687ed31d85a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -217,12 +217,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
#endif
}
-static inline int arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
-{
- return 0;
-}
-
#ifdef CONFIG_PPC_BOOK3E_64
static inline void arch_exit_mmap(struct mm_struct *mm)
{
@@ -247,6 +241,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
#ifdef CONFIG_PPC_MEM_KEYS
bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
bool execute, bool foreign);
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm);
#else /* CONFIG_PPC_MEM_KEYS */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
@@ -259,6 +254,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
#define thread_pkey_regs_save(thread)
#define thread_pkey_regs_restore(new_thread, old_thread)
#define thread_pkey_regs_init(thread)
+#define arch_dup_pkeys(oldmm, mm)
static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
{
@@ -267,5 +263,12 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
#endif /* CONFIG_PPC_MEM_KEYS */
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+ arch_dup_pkeys(oldmm, mm);
+ return 0;
+}
+
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 04b60a8f6e69..587807763737 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -415,3 +415,13 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
return pkey_access_permitted(vma_pkey(vma), write, execute);
}
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ if (static_branch_likely(&pkey_disabled))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
The patch below does not apply to the 3.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 6f5b9f018f4c7686fd944d920209d1382d320e4e Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Mon, 26 Nov 2018 18:12:00 -0200
Subject: [PATCH] powerpc/tm: Unset MSR[TS] if not recheckpointing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is a TM Bad Thing bug that can be caused when you return from a
signal context in a suspended transaction but with ucontext MSR[TS] unset.
This forces regs->msr[TS] to be set at syscall entrance (since the CPU
state is transactional). It also calls treclaim() to flush the transaction
state, which is done based on the live (mfmsr) MSR state.
Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not
called, thus, not executing recheckpoint, keeping the CPU state as not
transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU
state is non transactional, causing the TM Bad Thing with the following
stack:
[ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c
cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40]
pc: c00000000000c47c: fast_exception_return+0xac/0xb4
lr: 00003fff865f442c
sp: 3fffd9dce3e0
msr: 8000000102a03031
current = 0xc00000041f68b700
paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01
pid = 1721, comm = tm-signal-sigre
Linux version 4.9.0-3-powerpc64le (debian-kernel(a)lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
WARNING: exception is not recoverable, can't continue
The same problem happens on 32-bits signal handler, and the fix is very
similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be
zeroed.
This patch also fixes a sparse warning related to lack of indentation when
CONFIG_PPC_TRANSACTIONAL_MEM is set.
Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context")
CC: Stable <stable(a)vger.kernel.org> # 3.10+
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Tested-by: Michal Suchánek <msuchanek(a)suse.de>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 7484f43493d3..2d47cc79e5b3 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1158,11 +1158,11 @@ SYSCALL_DEFINE0(rt_sigreturn)
{
struct rt_sigframe __user *rt_sf;
struct pt_regs *regs = current_pt_regs();
+ int tm_restore = 0;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
struct ucontext __user *uc_transact;
unsigned long msr_hi;
unsigned long tmp;
- int tm_restore = 0;
#endif
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@@ -1210,11 +1210,19 @@ SYSCALL_DEFINE0(rt_sigreturn)
goto bad;
}
}
- if (!tm_restore)
- /* Fall through, for non-TM restore */
+ if (!tm_restore) {
+ /*
+ * Unset regs->msr because ucontext MSR TS is not
+ * set, and recheckpoint was not called. This avoid
+ * hitting a TM Bad thing at RFID
+ */
+ regs->msr &= ~MSR_TS_MASK;
+ }
+ /* Fall through, for non-TM restore */
#endif
- if (do_setcontext(&rt_sf->uc, regs, 1))
- goto bad;
+ if (!tm_restore)
+ if (do_setcontext(&rt_sf->uc, regs, 1))
+ goto bad;
/*
* It's not clear whether or why it is desirable to save the
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index ba093ec5a21f..0935fe6c282a 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -757,11 +757,23 @@ SYSCALL_DEFINE0(rt_sigreturn)
&uc_transact->uc_mcontext))
goto badframe;
}
- else
- /* Fall through, for non-TM restore */
#endif
- if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
- goto badframe;
+ /* Fall through, for non-TM restore */
+ if (!MSR_TM_ACTIVE(msr)) {
+ /*
+ * Unset MSR[TS] on the thread regs since MSR from user
+ * context does not have MSR active, and recheckpoint was
+ * not called since restore_tm_sigcontexts() was not called
+ * also.
+ *
+ * If not unsetting it, the code can RFID to userspace with
+ * MSR[TS] set, but without CPU in the proper state,
+ * causing a TM bad thing.
+ */
+ current->thread.regs->msr &= ~MSR_TS_MASK;
+ if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
+ goto badframe;
+ }
if (restore_altstack(&uc->uc_stack))
goto badframe;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 6f5b9f018f4c7686fd944d920209d1382d320e4e Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Mon, 26 Nov 2018 18:12:00 -0200
Subject: [PATCH] powerpc/tm: Unset MSR[TS] if not recheckpointing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is a TM Bad Thing bug that can be caused when you return from a
signal context in a suspended transaction but with ucontext MSR[TS] unset.
This forces regs->msr[TS] to be set at syscall entrance (since the CPU
state is transactional). It also calls treclaim() to flush the transaction
state, which is done based on the live (mfmsr) MSR state.
Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not
called, thus, not executing recheckpoint, keeping the CPU state as not
transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU
state is non transactional, causing the TM Bad Thing with the following
stack:
[ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c
cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40]
pc: c00000000000c47c: fast_exception_return+0xac/0xb4
lr: 00003fff865f442c
sp: 3fffd9dce3e0
msr: 8000000102a03031
current = 0xc00000041f68b700
paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01
pid = 1721, comm = tm-signal-sigre
Linux version 4.9.0-3-powerpc64le (debian-kernel(a)lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
WARNING: exception is not recoverable, can't continue
The same problem happens on 32-bits signal handler, and the fix is very
similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be
zeroed.
This patch also fixes a sparse warning related to lack of indentation when
CONFIG_PPC_TRANSACTIONAL_MEM is set.
Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context")
CC: Stable <stable(a)vger.kernel.org> # 3.10+
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Tested-by: Michal Suchánek <msuchanek(a)suse.de>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 7484f43493d3..2d47cc79e5b3 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1158,11 +1158,11 @@ SYSCALL_DEFINE0(rt_sigreturn)
{
struct rt_sigframe __user *rt_sf;
struct pt_regs *regs = current_pt_regs();
+ int tm_restore = 0;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
struct ucontext __user *uc_transact;
unsigned long msr_hi;
unsigned long tmp;
- int tm_restore = 0;
#endif
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@@ -1210,11 +1210,19 @@ SYSCALL_DEFINE0(rt_sigreturn)
goto bad;
}
}
- if (!tm_restore)
- /* Fall through, for non-TM restore */
+ if (!tm_restore) {
+ /*
+ * Unset regs->msr because ucontext MSR TS is not
+ * set, and recheckpoint was not called. This avoid
+ * hitting a TM Bad thing at RFID
+ */
+ regs->msr &= ~MSR_TS_MASK;
+ }
+ /* Fall through, for non-TM restore */
#endif
- if (do_setcontext(&rt_sf->uc, regs, 1))
- goto bad;
+ if (!tm_restore)
+ if (do_setcontext(&rt_sf->uc, regs, 1))
+ goto bad;
/*
* It's not clear whether or why it is desirable to save the
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index ba093ec5a21f..0935fe6c282a 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -757,11 +757,23 @@ SYSCALL_DEFINE0(rt_sigreturn)
&uc_transact->uc_mcontext))
goto badframe;
}
- else
- /* Fall through, for non-TM restore */
#endif
- if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
- goto badframe;
+ /* Fall through, for non-TM restore */
+ if (!MSR_TM_ACTIVE(msr)) {
+ /*
+ * Unset MSR[TS] on the thread regs since MSR from user
+ * context does not have MSR active, and recheckpoint was
+ * not called since restore_tm_sigcontexts() was not called
+ * also.
+ *
+ * If not unsetting it, the code can RFID to userspace with
+ * MSR[TS] set, but without CPU in the proper state,
+ * causing a TM bad thing.
+ */
+ current->thread.regs->msr &= ~MSR_TS_MASK;
+ if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
+ goto badframe;
+ }
if (restore_altstack(&uc->uc_stack))
goto badframe;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 6f5b9f018f4c7686fd944d920209d1382d320e4e Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Mon, 26 Nov 2018 18:12:00 -0200
Subject: [PATCH] powerpc/tm: Unset MSR[TS] if not recheckpointing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is a TM Bad Thing bug that can be caused when you return from a
signal context in a suspended transaction but with ucontext MSR[TS] unset.
This forces regs->msr[TS] to be set at syscall entrance (since the CPU
state is transactional). It also calls treclaim() to flush the transaction
state, which is done based on the live (mfmsr) MSR state.
Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not
called, thus, not executing recheckpoint, keeping the CPU state as not
transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU
state is non transactional, causing the TM Bad Thing with the following
stack:
[ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c
cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40]
pc: c00000000000c47c: fast_exception_return+0xac/0xb4
lr: 00003fff865f442c
sp: 3fffd9dce3e0
msr: 8000000102a03031
current = 0xc00000041f68b700
paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01
pid = 1721, comm = tm-signal-sigre
Linux version 4.9.0-3-powerpc64le (debian-kernel(a)lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
WARNING: exception is not recoverable, can't continue
The same problem happens on 32-bits signal handler, and the fix is very
similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be
zeroed.
This patch also fixes a sparse warning related to lack of indentation when
CONFIG_PPC_TRANSACTIONAL_MEM is set.
Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context")
CC: Stable <stable(a)vger.kernel.org> # 3.10+
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Tested-by: Michal Suchánek <msuchanek(a)suse.de>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 7484f43493d3..2d47cc79e5b3 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1158,11 +1158,11 @@ SYSCALL_DEFINE0(rt_sigreturn)
{
struct rt_sigframe __user *rt_sf;
struct pt_regs *regs = current_pt_regs();
+ int tm_restore = 0;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
struct ucontext __user *uc_transact;
unsigned long msr_hi;
unsigned long tmp;
- int tm_restore = 0;
#endif
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@@ -1210,11 +1210,19 @@ SYSCALL_DEFINE0(rt_sigreturn)
goto bad;
}
}
- if (!tm_restore)
- /* Fall through, for non-TM restore */
+ if (!tm_restore) {
+ /*
+ * Unset regs->msr because ucontext MSR TS is not
+ * set, and recheckpoint was not called. This avoid
+ * hitting a TM Bad thing at RFID
+ */
+ regs->msr &= ~MSR_TS_MASK;
+ }
+ /* Fall through, for non-TM restore */
#endif
- if (do_setcontext(&rt_sf->uc, regs, 1))
- goto bad;
+ if (!tm_restore)
+ if (do_setcontext(&rt_sf->uc, regs, 1))
+ goto bad;
/*
* It's not clear whether or why it is desirable to save the
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index ba093ec5a21f..0935fe6c282a 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -757,11 +757,23 @@ SYSCALL_DEFINE0(rt_sigreturn)
&uc_transact->uc_mcontext))
goto badframe;
}
- else
- /* Fall through, for non-TM restore */
#endif
- if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
- goto badframe;
+ /* Fall through, for non-TM restore */
+ if (!MSR_TM_ACTIVE(msr)) {
+ /*
+ * Unset MSR[TS] on the thread regs since MSR from user
+ * context does not have MSR active, and recheckpoint was
+ * not called since restore_tm_sigcontexts() was not called
+ * also.
+ *
+ * If not unsetting it, the code can RFID to userspace with
+ * MSR[TS] set, but without CPU in the proper state,
+ * causing a TM bad thing.
+ */
+ current->thread.regs->msr &= ~MSR_TS_MASK;
+ if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
+ goto badframe;
+ }
if (restore_altstack(&uc->uc_stack))
goto badframe;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 6f5b9f018f4c7686fd944d920209d1382d320e4e Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Mon, 26 Nov 2018 18:12:00 -0200
Subject: [PATCH] powerpc/tm: Unset MSR[TS] if not recheckpointing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
There is a TM Bad Thing bug that can be caused when you return from a
signal context in a suspended transaction but with ucontext MSR[TS] unset.
This forces regs->msr[TS] to be set at syscall entrance (since the CPU
state is transactional). It also calls treclaim() to flush the transaction
state, which is done based on the live (mfmsr) MSR state.
Since user context MSR[TS] is not set, then restore_tm_sigcontexts() is not
called, thus, not executing recheckpoint, keeping the CPU state as not
transactional. When calling rfid, SRR1 will have MSR[TS] set, but the CPU
state is non transactional, causing the TM Bad Thing with the following
stack:
[ 33.862316] Bad kernel stack pointer 3fffd9dce3e0 at c00000000000c47c
cpu 0x8: Vector: 700 (Program Check) at [c00000003ff7fd40]
pc: c00000000000c47c: fast_exception_return+0xac/0xb4
lr: 00003fff865f442c
sp: 3fffd9dce3e0
msr: 8000000102a03031
current = 0xc00000041f68b700
paca = 0xc00000000fb84800 softe: 0 irq_happened: 0x01
pid = 1721, comm = tm-signal-sigre
Linux version 4.9.0-3-powerpc64le (debian-kernel(a)lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
WARNING: exception is not recoverable, can't continue
The same problem happens on 32-bits signal handler, and the fix is very
similar, if tm_recheckpoint() is not executed, then regs->msr[TS] should be
zeroed.
This patch also fixes a sparse warning related to lack of indentation when
CONFIG_PPC_TRANSACTIONAL_MEM is set.
Fixes: 2b0a576d15e0e ("powerpc: Add new transactional memory state to the signal context")
CC: Stable <stable(a)vger.kernel.org> # 3.10+
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Tested-by: Michal Suchánek <msuchanek(a)suse.de>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 7484f43493d3..2d47cc79e5b3 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -1158,11 +1158,11 @@ SYSCALL_DEFINE0(rt_sigreturn)
{
struct rt_sigframe __user *rt_sf;
struct pt_regs *regs = current_pt_regs();
+ int tm_restore = 0;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
struct ucontext __user *uc_transact;
unsigned long msr_hi;
unsigned long tmp;
- int tm_restore = 0;
#endif
/* Always make any pending restarted system calls return -EINTR */
current->restart_block.fn = do_no_restart_syscall;
@@ -1210,11 +1210,19 @@ SYSCALL_DEFINE0(rt_sigreturn)
goto bad;
}
}
- if (!tm_restore)
- /* Fall through, for non-TM restore */
+ if (!tm_restore) {
+ /*
+ * Unset regs->msr because ucontext MSR TS is not
+ * set, and recheckpoint was not called. This avoid
+ * hitting a TM Bad thing at RFID
+ */
+ regs->msr &= ~MSR_TS_MASK;
+ }
+ /* Fall through, for non-TM restore */
#endif
- if (do_setcontext(&rt_sf->uc, regs, 1))
- goto bad;
+ if (!tm_restore)
+ if (do_setcontext(&rt_sf->uc, regs, 1))
+ goto bad;
/*
* It's not clear whether or why it is desirable to save the
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index ba093ec5a21f..0935fe6c282a 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -757,11 +757,23 @@ SYSCALL_DEFINE0(rt_sigreturn)
&uc_transact->uc_mcontext))
goto badframe;
}
- else
- /* Fall through, for non-TM restore */
#endif
- if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
- goto badframe;
+ /* Fall through, for non-TM restore */
+ if (!MSR_TM_ACTIVE(msr)) {
+ /*
+ * Unset MSR[TS] on the thread regs since MSR from user
+ * context does not have MSR active, and recheckpoint was
+ * not called since restore_tm_sigcontexts() was not called
+ * also.
+ *
+ * If not unsetting it, the code can RFID to userspace with
+ * MSR[TS] set, but without CPU in the proper state,
+ * causing a TM bad thing.
+ */
+ current->thread.regs->msr &= ~MSR_TS_MASK;
+ if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
+ goto badframe;
+ }
if (restore_altstack(&uc->uc_stack))
goto badframe;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From e1c3743e1a20647c53b719dbf28b48f45d23f2cd Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao(a)debian.org>
Date: Wed, 21 Nov 2018 17:21:09 -0200
Subject: [PATCH] powerpc/tm: Set MSR[TS] just prior to recheckpoint
On a signal handler return, the user could set a context with MSR[TS] bits
set, and these bits would be copied to task regs->msr.
At restore_tm_sigcontexts(), after current task regs->msr[TS] bits are set,
several __get_user() are called and then a recheckpoint is executed.
This is a problem since a page fault (in kernel space) could happen when
calling __get_user(). If it happens, the process MSR[TS] bits were
already set, but recheckpoint was not executed, and SPRs are still invalid.
The page fault can cause the current process to be de-scheduled, with
MSR[TS] active and without tm_recheckpoint() being called. More
importantly, without TEXASR[FS] bit set also.
Since TEXASR might not have the FS bit set, and when the process is
scheduled back, it will try to reclaim, which will be aborted because of
the CPU is not in the suspended state, and, then, recheckpoint. This
recheckpoint will restore thread->texasr into TEXASR SPR, which might be
zero, hitting a BUG_ON().
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
cpu 0xb: Vector: 700 (Program Check) at [c00000041f1576d0]
pc: c000000000054550: restore_gprs+0xb0/0x180
lr: 0000000000000000
sp: c00000041f157950
msr: 8000000100021033
current = 0xc00000041f143000
paca = 0xc00000000fb86300 softe: 0 irq_happened: 0x01
pid = 1021, comm = kworker/11:1
kernel BUG at /build/linux-sf3Co9/linux-4.9.30/arch/powerpc/kernel/tm.S:434!
Linux version 4.9.0-3-powerpc64le (debian-kernel(a)lists.debian.org) (gcc version 6.3.0 20170516 (Debian 6.3.0-18) ) #1 SMP Debian 4.9.30-2+deb9u2 (2017-06-26)
enter ? for help
[c00000041f157b30] c00000000001bc3c tm_recheckpoint.part.11+0x6c/0xa0
[c00000041f157b70] c00000000001d184 __switch_to+0x1e4/0x4c0
[c00000041f157bd0] c00000000082eeb8 __schedule+0x2f8/0x990
[c00000041f157cb0] c00000000082f598 schedule+0x48/0xc0
[c00000041f157ce0] c0000000000f0d28 worker_thread+0x148/0x610
[c00000041f157d80] c0000000000f96b0 kthread+0x120/0x140
[c00000041f157e30] c00000000000c0e0 ret_from_kernel_thread+0x5c/0x7c
This patch simply delays the MSR[TS] set, so, if there is any page fault in
the __get_user() section, it does not have regs->msr[TS] set, since the TM
structures are still invalid, thus avoiding doing TM operations for
in-kernel exceptions and possible process reschedule.
With this patch, the MSR[TS] will only be set just before recheckpointing
and setting TEXASR[FS] = 1, thus avoiding an interrupt with TM registers in
invalid state.
Other than that, if CONFIG_PREEMPT is set, there might be a preemption just
after setting MSR[TS] and before tm_recheckpoint(), thus, this block must
be atomic from a preemption perspective, thus, calling
preempt_disable/enable() on this code.
It is not possible to move tm_recheckpoint to happen earlier, because it is
required to get the checkpointed registers from userspace, with
__get_user(), thus, the only way to avoid this undesired behavior is
delaying the MSR[TS] set.
The 32-bits signal handler seems to be safe this current issue, but, it
might be exposed to the preemption issue, thus, disabling preemption in
this chunk of code.
Changes from v2:
* Run the critical section with preempt_disable.
Fixes: 87b4e5393af7 ("powerpc/tm: Fix return of active 64bit signals")
Cc: stable(a)vger.kernel.org (v3.9+)
Signed-off-by: Breno Leitao <leitao(a)debian.org>
Signed-off-by: Michael Ellerman <mpe(a)ellerman.id.au>
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 9d39e0eb03ff..7484f43493d3 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -848,7 +848,23 @@ static long restore_tm_user_regs(struct pt_regs *regs,
/* If TM bits are set to the reserved value, it's an invalid context */
if (MSR_TM_RESV(msr_hi))
return 1;
- /* Pull in the MSR TM bits from the user context */
+
+ /*
+ * Disabling preemption, since it is unsafe to be preempted
+ * with MSR[TS] set without recheckpointing.
+ */
+ preempt_disable();
+
+ /*
+ * CAUTION:
+ * After regs->MSR[TS] being updated, make sure that get_user(),
+ * put_user() or similar functions are *not* called. These
+ * functions can generate page faults which will cause the process
+ * to be de-scheduled with MSR[TS] set but without calling
+ * tm_recheckpoint(). This can cause a bug.
+ *
+ * Pull in the MSR TM bits from the user context
+ */
regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr_hi & MSR_TS_MASK);
/* Now, recheckpoint. This loads up all of the checkpointed (older)
* registers, including FP and V[S]Rs. After recheckpointing, the
@@ -873,6 +889,8 @@ static long restore_tm_user_regs(struct pt_regs *regs,
}
#endif
+ preempt_enable();
+
return 0;
}
#endif
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index e53ad11be385..ba093ec5a21f 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -467,20 +467,6 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
if (MSR_TM_RESV(msr))
return -EINVAL;
- /* pull in MSR TS bits from user context */
- regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
-
- /*
- * Ensure that TM is enabled in regs->msr before we leave the signal
- * handler. It could be the case that (a) user disabled the TM bit
- * through the manipulation of the MSR bits in uc_mcontext or (b) the
- * TM bit was disabled because a sufficient number of context switches
- * happened whilst in the signal handler and load_tm overflowed,
- * disabling the TM bit. In either case we can end up with an illegal
- * TM state leading to a TM Bad Thing when we return to userspace.
- */
- regs->msr |= MSR_TM;
-
/* pull in MSR LE from user context */
regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
@@ -572,6 +558,34 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
tm_enable();
/* Make sure the transaction is marked as failed */
tsk->thread.tm_texasr |= TEXASR_FS;
+
+ /*
+ * Disabling preemption, since it is unsafe to be preempted
+ * with MSR[TS] set without recheckpointing.
+ */
+ preempt_disable();
+
+ /* pull in MSR TS bits from user context */
+ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
+
+ /*
+ * Ensure that TM is enabled in regs->msr before we leave the signal
+ * handler. It could be the case that (a) user disabled the TM bit
+ * through the manipulation of the MSR bits in uc_mcontext or (b) the
+ * TM bit was disabled because a sufficient number of context switches
+ * happened whilst in the signal handler and load_tm overflowed,
+ * disabling the TM bit. In either case we can end up with an illegal
+ * TM state leading to a TM Bad Thing when we return to userspace.
+ *
+ * CAUTION:
+ * After regs->MSR[TS] being updated, make sure that get_user(),
+ * put_user() or similar functions are *not* called. These
+ * functions can generate page faults which will cause the process
+ * to be de-scheduled with MSR[TS] set but without calling
+ * tm_recheckpoint(). This can cause a bug.
+ */
+ regs->msr |= MSR_TM;
+
/* This loads the checkpointed FP/VEC state, if used */
tm_recheckpoint(&tsk->thread);
@@ -585,6 +599,8 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
regs->msr |= MSR_VEC;
}
+ preempt_enable();
+
return err;
}
#endif
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be6821f82c3cc36e026f5afd10249988852b35ea Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 11 Dec 2018 10:19:45 +0000
Subject: [PATCH] Btrfs: send, fix race with transaction commits that create
snapshots
If we create a snapshot of a snapshot currently being used by a send
operation, we can end up with send failing unexpectedly (returning
-ENOENT error to user space for example). The following diagram shows
how this happens.
CPU 1 CPU2 CPU3
btrfs_ioctl_send()
(...)
create_snapshot()
-> creates snapshot of a
root used by the send
task
btrfs_commit_transaction()
create_pending_snapshot()
__get_inode_info()
btrfs_search_slot()
btrfs_search_slot_get_root()
down_read commit_root_sem
get reference on eb of the
commit root
-> eb with bytenr == X
up_read commit_root_sem
btrfs_cow_block(root node)
btrfs_free_tree_block()
-> creates delayed ref to
free the extent
btrfs_run_delayed_refs()
-> runs the delayed ref,
adds extent to
fs_info->pinned_extents
btrfs_finish_extent_commit()
unpin_extent_range()
-> marks extent as free
in the free space cache
transaction commit finishes
btrfs_start_transaction()
(...)
btrfs_cow_block()
btrfs_alloc_tree_block()
btrfs_reserve_extent()
-> allocates extent at
bytenr == X
btrfs_init_new_buffer(bytenr X)
btrfs_find_create_tree_block()
alloc_extent_buffer(bytenr X)
find_extent_buffer(bytenr X)
-> returns existing eb,
which the send task got
(...)
-> modifies content of the
eb with bytenr == X
-> uses an eb that now
belongs to some other
tree and no more matches
the commit root of the
snapshot, resuts will be
unpredictable
The consequences of this race can be various, and can lead to searches in
the commit root performed by the send task failing unexpectedly (unable to
find inode items, returning -ENOENT to user space, for example) or not
failing because an inode item with the same number was added to the tree
that reused the metadata extent, in which case send can behave incorrectly
in the worst case or just fail later for some reason.
Fix this by performing a copy of the commit root's extent buffer when doing
a search in the context of a send operation.
CC: stable(a)vger.kernel.org # 4.4.x: 1fc28d8e2e9: Btrfs: move get root out of btrfs_search_slot to a helper
CC: stable(a)vger.kernel.org # 4.4.x: f9ddfd0592a: Btrfs: remove unused check of skip_locking
CC: stable(a)vger.kernel.org # 4.4.x
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 72745235896f..4252e89df6ae 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2587,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
root_lock = BTRFS_READ_LOCK;
if (p->search_commit_root) {
- /* The commit roots are read only so we always do read locks */
- if (p->need_commit_sem)
+ /*
+ * The commit roots are read only so we always do read locks,
+ * and we always must hold the commit_root_sem when doing
+ * searches on them, the only exception is send where we don't
+ * want to block transaction commits for a long time, so
+ * we need to clone the commit root in order to avoid races
+ * with transaction commits that create a snapshot of one of
+ * the roots used by a send operation.
+ */
+ if (p->need_commit_sem) {
down_read(&fs_info->commit_root_sem);
- b = root->commit_root;
- extent_buffer_get(b);
- level = btrfs_header_level(b);
- if (p->need_commit_sem)
+ b = btrfs_clone_extent_buffer(root->commit_root);
up_read(&fs_info->commit_root_sem);
+ if (!b)
+ return ERR_PTR(-ENOMEM);
+
+ } else {
+ b = root->commit_root;
+ extent_buffer_get(b);
+ }
+ level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
* p->search_commit_root = 1.
@@ -2720,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto done;
+ }
while (b) {
level = btrfs_header_level(b);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be6821f82c3cc36e026f5afd10249988852b35ea Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 11 Dec 2018 10:19:45 +0000
Subject: [PATCH] Btrfs: send, fix race with transaction commits that create
snapshots
If we create a snapshot of a snapshot currently being used by a send
operation, we can end up with send failing unexpectedly (returning
-ENOENT error to user space for example). The following diagram shows
how this happens.
CPU 1 CPU2 CPU3
btrfs_ioctl_send()
(...)
create_snapshot()
-> creates snapshot of a
root used by the send
task
btrfs_commit_transaction()
create_pending_snapshot()
__get_inode_info()
btrfs_search_slot()
btrfs_search_slot_get_root()
down_read commit_root_sem
get reference on eb of the
commit root
-> eb with bytenr == X
up_read commit_root_sem
btrfs_cow_block(root node)
btrfs_free_tree_block()
-> creates delayed ref to
free the extent
btrfs_run_delayed_refs()
-> runs the delayed ref,
adds extent to
fs_info->pinned_extents
btrfs_finish_extent_commit()
unpin_extent_range()
-> marks extent as free
in the free space cache
transaction commit finishes
btrfs_start_transaction()
(...)
btrfs_cow_block()
btrfs_alloc_tree_block()
btrfs_reserve_extent()
-> allocates extent at
bytenr == X
btrfs_init_new_buffer(bytenr X)
btrfs_find_create_tree_block()
alloc_extent_buffer(bytenr X)
find_extent_buffer(bytenr X)
-> returns existing eb,
which the send task got
(...)
-> modifies content of the
eb with bytenr == X
-> uses an eb that now
belongs to some other
tree and no more matches
the commit root of the
snapshot, resuts will be
unpredictable
The consequences of this race can be various, and can lead to searches in
the commit root performed by the send task failing unexpectedly (unable to
find inode items, returning -ENOENT to user space, for example) or not
failing because an inode item with the same number was added to the tree
that reused the metadata extent, in which case send can behave incorrectly
in the worst case or just fail later for some reason.
Fix this by performing a copy of the commit root's extent buffer when doing
a search in the context of a send operation.
CC: stable(a)vger.kernel.org # 4.4.x: 1fc28d8e2e9: Btrfs: move get root out of btrfs_search_slot to a helper
CC: stable(a)vger.kernel.org # 4.4.x: f9ddfd0592a: Btrfs: remove unused check of skip_locking
CC: stable(a)vger.kernel.org # 4.4.x
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 72745235896f..4252e89df6ae 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2587,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
root_lock = BTRFS_READ_LOCK;
if (p->search_commit_root) {
- /* The commit roots are read only so we always do read locks */
- if (p->need_commit_sem)
+ /*
+ * The commit roots are read only so we always do read locks,
+ * and we always must hold the commit_root_sem when doing
+ * searches on them, the only exception is send where we don't
+ * want to block transaction commits for a long time, so
+ * we need to clone the commit root in order to avoid races
+ * with transaction commits that create a snapshot of one of
+ * the roots used by a send operation.
+ */
+ if (p->need_commit_sem) {
down_read(&fs_info->commit_root_sem);
- b = root->commit_root;
- extent_buffer_get(b);
- level = btrfs_header_level(b);
- if (p->need_commit_sem)
+ b = btrfs_clone_extent_buffer(root->commit_root);
up_read(&fs_info->commit_root_sem);
+ if (!b)
+ return ERR_PTR(-ENOMEM);
+
+ } else {
+ b = root->commit_root;
+ extent_buffer_get(b);
+ }
+ level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
* p->search_commit_root = 1.
@@ -2720,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto done;
+ }
while (b) {
level = btrfs_header_level(b);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be6821f82c3cc36e026f5afd10249988852b35ea Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 11 Dec 2018 10:19:45 +0000
Subject: [PATCH] Btrfs: send, fix race with transaction commits that create
snapshots
If we create a snapshot of a snapshot currently being used by a send
operation, we can end up with send failing unexpectedly (returning
-ENOENT error to user space for example). The following diagram shows
how this happens.
CPU 1 CPU2 CPU3
btrfs_ioctl_send()
(...)
create_snapshot()
-> creates snapshot of a
root used by the send
task
btrfs_commit_transaction()
create_pending_snapshot()
__get_inode_info()
btrfs_search_slot()
btrfs_search_slot_get_root()
down_read commit_root_sem
get reference on eb of the
commit root
-> eb with bytenr == X
up_read commit_root_sem
btrfs_cow_block(root node)
btrfs_free_tree_block()
-> creates delayed ref to
free the extent
btrfs_run_delayed_refs()
-> runs the delayed ref,
adds extent to
fs_info->pinned_extents
btrfs_finish_extent_commit()
unpin_extent_range()
-> marks extent as free
in the free space cache
transaction commit finishes
btrfs_start_transaction()
(...)
btrfs_cow_block()
btrfs_alloc_tree_block()
btrfs_reserve_extent()
-> allocates extent at
bytenr == X
btrfs_init_new_buffer(bytenr X)
btrfs_find_create_tree_block()
alloc_extent_buffer(bytenr X)
find_extent_buffer(bytenr X)
-> returns existing eb,
which the send task got
(...)
-> modifies content of the
eb with bytenr == X
-> uses an eb that now
belongs to some other
tree and no more matches
the commit root of the
snapshot, resuts will be
unpredictable
The consequences of this race can be various, and can lead to searches in
the commit root performed by the send task failing unexpectedly (unable to
find inode items, returning -ENOENT to user space, for example) or not
failing because an inode item with the same number was added to the tree
that reused the metadata extent, in which case send can behave incorrectly
in the worst case or just fail later for some reason.
Fix this by performing a copy of the commit root's extent buffer when doing
a search in the context of a send operation.
CC: stable(a)vger.kernel.org # 4.4.x: 1fc28d8e2e9: Btrfs: move get root out of btrfs_search_slot to a helper
CC: stable(a)vger.kernel.org # 4.4.x: f9ddfd0592a: Btrfs: remove unused check of skip_locking
CC: stable(a)vger.kernel.org # 4.4.x
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 72745235896f..4252e89df6ae 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2587,14 +2587,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
root_lock = BTRFS_READ_LOCK;
if (p->search_commit_root) {
- /* The commit roots are read only so we always do read locks */
- if (p->need_commit_sem)
+ /*
+ * The commit roots are read only so we always do read locks,
+ * and we always must hold the commit_root_sem when doing
+ * searches on them, the only exception is send where we don't
+ * want to block transaction commits for a long time, so
+ * we need to clone the commit root in order to avoid races
+ * with transaction commits that create a snapshot of one of
+ * the roots used by a send operation.
+ */
+ if (p->need_commit_sem) {
down_read(&fs_info->commit_root_sem);
- b = root->commit_root;
- extent_buffer_get(b);
- level = btrfs_header_level(b);
- if (p->need_commit_sem)
+ b = btrfs_clone_extent_buffer(root->commit_root);
up_read(&fs_info->commit_root_sem);
+ if (!b)
+ return ERR_PTR(-ENOMEM);
+
+ } else {
+ b = root->commit_root;
+ extent_buffer_get(b);
+ }
+ level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
* p->search_commit_root = 1.
@@ -2720,6 +2733,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
again:
prev_cmp = -1;
b = btrfs_search_slot_get_root(root, p, write_lock_level);
+ if (IS_ERR(b)) {
+ ret = PTR_ERR(b);
+ goto done;
+ }
while (b) {
level = btrfs_header_level(b);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d1051d6ebf8ef3517a5a3cf82bba8436d190f1c2 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Wed, 21 Nov 2018 17:10:52 +0200
Subject: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Running btrfs/124 in a loop hung up on me sporadically with the
following call trace:
btrfs D 0 5760 5324 0x00000000
Call Trace:
? __schedule+0x243/0x800
schedule+0x33/0x90
btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
? wait_woken+0xa0/0xa0
btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
btrfs_relocate_chunk+0x49/0x100 [btrfs]
btrfs_balance+0xbeb/0x1740 [btrfs]
btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
btrfs_ioctl+0x1691/0x3110 [btrfs]
? lockdep_hardirqs_on+0xed/0x180
? __handle_mm_fault+0x8e7/0xfb0
? _raw_spin_unlock+0x24/0x30
? __handle_mm_fault+0x8e7/0xfb0
? do_vfs_ioctl+0xa5/0x6e0
? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
do_vfs_ioctl+0xa5/0x6e0
? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
ksys_ioctl+0x3a/0x70
__x64_sys_ioctl+0x16/0x20
do_syscall_64+0x60/0x1b0
entry_SYSCALL_64_after_hwframe+0x49/0xbe
This happens because during page writeback it's valid for
writepage_delalloc to instantiate a delalloc range which doesn't belong
to the page currently being written back.
The reason this case is valid is due to find_lock_delalloc_range
returning any available range after the passed delalloc_start and
ignoring whether the page under writeback is within that range.
In turn ordered extents (OE) are always created for the returned range
from find_lock_delalloc_range. If, however, a failure occurs while OE
are being created then the clean up code in btrfs_cleanup_ordered_extents
will be called.
Unfortunately the code in btrfs_cleanup_ordered_extents doesn't consider
the case of such 'foreign' range being processed and instead it always
assumes that the range OE are created for belongs to the page. This
leads to the first page of such foregin range to not be cleaned up since
it's deliberately missed and skipped by the current cleaning up code.
Fix this by correctly checking whether the current page belongs to the
range being instantiated and if so adjsut the range parameters passed
for cleaning up. If it doesn't, then just clean the whole OE range
directly.
Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e7b97c699acf..e1451a69432b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -110,17 +110,17 @@ static void __endio_write_update_ordered(struct inode *inode,
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
* to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of
- * btrfs_run_delalloc_range already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
- const u64 offset,
- const u64 bytes)
+ struct page *locked_page,
+ u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ u64 page_start = page_offset(locked_page);
+ u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
while (index <= end_index) {
@@ -131,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
- return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
- bytes - PAGE_SIZE, false);
+
+ /*
+ * In case this page belongs to the delalloc range being instantiated
+ * then skip it, since the first page of a range is going to be
+ * properly cleaned up by the caller of run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ offset += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
+ }
+
+ return __endio_write_update_ordered(inode, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -1603,7 +1613,8 @@ int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
write_flags);
}
if (ret)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ end - start + 1);
return ret;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d1051d6ebf8ef3517a5a3cf82bba8436d190f1c2 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Wed, 21 Nov 2018 17:10:52 +0200
Subject: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Running btrfs/124 in a loop hung up on me sporadically with the
following call trace:
btrfs D 0 5760 5324 0x00000000
Call Trace:
? __schedule+0x243/0x800
schedule+0x33/0x90
btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
? wait_woken+0xa0/0xa0
btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
btrfs_relocate_chunk+0x49/0x100 [btrfs]
btrfs_balance+0xbeb/0x1740 [btrfs]
btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
btrfs_ioctl+0x1691/0x3110 [btrfs]
? lockdep_hardirqs_on+0xed/0x180
? __handle_mm_fault+0x8e7/0xfb0
? _raw_spin_unlock+0x24/0x30
? __handle_mm_fault+0x8e7/0xfb0
? do_vfs_ioctl+0xa5/0x6e0
? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
do_vfs_ioctl+0xa5/0x6e0
? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
ksys_ioctl+0x3a/0x70
__x64_sys_ioctl+0x16/0x20
do_syscall_64+0x60/0x1b0
entry_SYSCALL_64_after_hwframe+0x49/0xbe
This happens because during page writeback it's valid for
writepage_delalloc to instantiate a delalloc range which doesn't belong
to the page currently being written back.
The reason this case is valid is due to find_lock_delalloc_range
returning any available range after the passed delalloc_start and
ignoring whether the page under writeback is within that range.
In turn ordered extents (OE) are always created for the returned range
from find_lock_delalloc_range. If, however, a failure occurs while OE
are being created then the clean up code in btrfs_cleanup_ordered_extents
will be called.
Unfortunately the code in btrfs_cleanup_ordered_extents doesn't consider
the case of such 'foreign' range being processed and instead it always
assumes that the range OE are created for belongs to the page. This
leads to the first page of such foregin range to not be cleaned up since
it's deliberately missed and skipped by the current cleaning up code.
Fix this by correctly checking whether the current page belongs to the
range being instantiated and if so adjsut the range parameters passed
for cleaning up. If it doesn't, then just clean the whole OE range
directly.
Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e7b97c699acf..e1451a69432b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -110,17 +110,17 @@ static void __endio_write_update_ordered(struct inode *inode,
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
* to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of
- * btrfs_run_delalloc_range already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
- const u64 offset,
- const u64 bytes)
+ struct page *locked_page,
+ u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ u64 page_start = page_offset(locked_page);
+ u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
while (index <= end_index) {
@@ -131,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
- return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
- bytes - PAGE_SIZE, false);
+
+ /*
+ * In case this page belongs to the delalloc range being instantiated
+ * then skip it, since the first page of a range is going to be
+ * properly cleaned up by the caller of run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ offset += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
+ }
+
+ return __endio_write_update_ordered(inode, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -1603,7 +1613,8 @@ int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
write_flags);
}
if (ret)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ end - start + 1);
return ret;
}
The patch below does not apply to the 4.20-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From d1051d6ebf8ef3517a5a3cf82bba8436d190f1c2 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Wed, 21 Nov 2018 17:10:52 +0200
Subject: [PATCH] btrfs: Fix error handling in btrfs_cleanup_ordered_extents
Running btrfs/124 in a loop hung up on me sporadically with the
following call trace:
btrfs D 0 5760 5324 0x00000000
Call Trace:
? __schedule+0x243/0x800
schedule+0x33/0x90
btrfs_start_ordered_extent+0x10c/0x1b0 [btrfs]
? wait_woken+0xa0/0xa0
btrfs_wait_ordered_range+0xbb/0x100 [btrfs]
btrfs_relocate_block_group+0x1ff/0x230 [btrfs]
btrfs_relocate_chunk+0x49/0x100 [btrfs]
btrfs_balance+0xbeb/0x1740 [btrfs]
btrfs_ioctl_balance+0x2ee/0x380 [btrfs]
btrfs_ioctl+0x1691/0x3110 [btrfs]
? lockdep_hardirqs_on+0xed/0x180
? __handle_mm_fault+0x8e7/0xfb0
? _raw_spin_unlock+0x24/0x30
? __handle_mm_fault+0x8e7/0xfb0
? do_vfs_ioctl+0xa5/0x6e0
? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
do_vfs_ioctl+0xa5/0x6e0
? entry_SYSCALL_64_after_hwframe+0x3e/0xbe
ksys_ioctl+0x3a/0x70
__x64_sys_ioctl+0x16/0x20
do_syscall_64+0x60/0x1b0
entry_SYSCALL_64_after_hwframe+0x49/0xbe
This happens because during page writeback it's valid for
writepage_delalloc to instantiate a delalloc range which doesn't belong
to the page currently being written back.
The reason this case is valid is due to find_lock_delalloc_range
returning any available range after the passed delalloc_start and
ignoring whether the page under writeback is within that range.
In turn ordered extents (OE) are always created for the returned range
from find_lock_delalloc_range. If, however, a failure occurs while OE
are being created then the clean up code in btrfs_cleanup_ordered_extents
will be called.
Unfortunately the code in btrfs_cleanup_ordered_extents doesn't consider
the case of such 'foreign' range being processed and instead it always
assumes that the range OE are created for belongs to the page. This
leads to the first page of such foregin range to not be cleaned up since
it's deliberately missed and skipped by the current cleaning up code.
Fix this by correctly checking whether the current page belongs to the
range being instantiated and if so adjsut the range parameters passed
for cleaning up. If it doesn't, then just clean the whole OE range
directly.
Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
CC: stable(a)vger.kernel.org # 4.14+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e7b97c699acf..e1451a69432b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -110,17 +110,17 @@ static void __endio_write_update_ordered(struct inode *inode,
* extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
* and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
* to be released, which we want to happen only when finishing the ordered
- * extent (btrfs_finish_ordered_io()). Also note that the caller of
- * btrfs_run_delalloc_range already does proper cleanup for the first page of
- * the range, that is, it invokes the callback writepage_end_io_hook() for the
- * range of the first page.
+ * extent (btrfs_finish_ordered_io()).
*/
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
- const u64 offset,
- const u64 bytes)
+ struct page *locked_page,
+ u64 offset, u64 bytes)
{
unsigned long index = offset >> PAGE_SHIFT;
unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ u64 page_start = page_offset(locked_page);
+ u64 page_end = page_start + PAGE_SIZE - 1;
+
struct page *page;
while (index <= end_index) {
@@ -131,8 +131,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
ClearPagePrivate2(page);
put_page(page);
}
- return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
- bytes - PAGE_SIZE, false);
+
+ /*
+ * In case this page belongs to the delalloc range being instantiated
+ * then skip it, since the first page of a range is going to be
+ * properly cleaned up by the caller of run_delalloc_range
+ */
+ if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+ offset += PAGE_SIZE;
+ bytes -= PAGE_SIZE;
+ }
+
+ return __endio_write_update_ordered(inode, offset, bytes, false);
}
static int btrfs_dirty_inode(struct inode *inode);
@@ -1603,7 +1613,8 @@ int btrfs_run_delalloc_range(void *private_data, struct page *locked_page,
write_flags);
}
if (ret)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, locked_page, start,
+ end - start + 1);
return ret;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a5fb11429167ee6ddeeacc554efaf5776b36433a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 26 Nov 2018 20:07:17 +0000
Subject: [PATCH] Btrfs: fix deadlock with memory reclaim during scrub
When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL from scrub,
otherwise we risk getting into a deadlock with reclaim.
Checking for scrub pause requests is done early at the beginning of the
while loop of scrub_stripe() and later in the loop, scrub_extent() and
scrub_raid56_parity() are called, which in turn call scrub_pages() and
scrub_pages_for_parity() respectively. These last two functions do memory
allocations using GFP_KERNEL. Same problem could happen while scrubbing
the super blocks, since it calls scrub_pages().
We also can not have any of the worker tasks, created by the scrub task,
doing GFP_KERNEL allocations, because before pausing, the scrub task waits
for all the worker tasks to complete (also done at scrub_stripe()).
So make sure GFP_NOFS is used for the memory allocations because at any
time a scrub pause request can happen from another task that started to
commit a transaction.
Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
CC: stable(a)vger.kernel.org # 4.6+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..bbd1b36f4918 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -322,6 +322,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
+ unsigned int nofs_flag;
lockdep_assert_held(&locks_root->lock);
@@ -339,8 +340,17 @@ static struct full_stripe_lock *insert_full_stripe_lock(
}
}
- /* Insert new lock */
+ /*
+ * Insert new lock.
+ *
+ * We must use GFP_NOFS because the scrub task might be waiting for a
+ * worker task executing this function and in turn a transaction commit
+ * might be waiting the scrub task to pause (which needs to wait for all
+ * the worker tasks to complete before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
@@ -1620,8 +1630,19 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
mutex_lock(&sctx->wr_lock);
again:
if (!sctx->wr_curr_bio) {
+ unsigned int nofs_flag;
+
+ /*
+ * We must use GFP_NOFS because the scrub task might be waiting
+ * for a worker task executing this function and in turn a
+ * transaction commit might be waiting the scrub task to pause
+ * (which needs to wait for all the worker tasks to complete
+ * before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!sctx->wr_curr_bio) {
mutex_unlock(&sctx->wr_lock);
return -ENOMEM;
@@ -3772,6 +3793,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3875,6 +3897,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
+ /*
+ * In order to avoid deadlock with reclaim when there is a transaction
+ * trying to pause scrub, make sure we use GFP_NOFS for all the
+ * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * invoked by our callees. The pausing request is done when the
+ * transaction commit starts, and it blocks the transaction until scrub
+ * is paused (done at specific points at scrub_stripe() or right above
+ * before incrementing fs_info->scrubs_running).
+ */
+ nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
/*
* by holding device list mutex, we can
@@ -3887,6 +3919,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
+ memalloc_nofs_restore(nofs_flag);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From a5fb11429167ee6ddeeacc554efaf5776b36433a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 26 Nov 2018 20:07:17 +0000
Subject: [PATCH] Btrfs: fix deadlock with memory reclaim during scrub
When a transaction commit starts, it attempts to pause scrub and it blocks
until the scrub is paused. So while the transaction is blocked waiting for
scrub to pause, we can not do memory allocation with GFP_KERNEL from scrub,
otherwise we risk getting into a deadlock with reclaim.
Checking for scrub pause requests is done early at the beginning of the
while loop of scrub_stripe() and later in the loop, scrub_extent() and
scrub_raid56_parity() are called, which in turn call scrub_pages() and
scrub_pages_for_parity() respectively. These last two functions do memory
allocations using GFP_KERNEL. Same problem could happen while scrubbing
the super blocks, since it calls scrub_pages().
We also can not have any of the worker tasks, created by the scrub task,
doing GFP_KERNEL allocations, because before pausing, the scrub task waits
for all the worker tasks to complete (also done at scrub_stripe()).
So make sure GFP_NOFS is used for the memory allocations because at any
time a scrub pause request can happen from another task that started to
commit a transaction.
Fixes: 58c4e173847a ("btrfs: scrub: use GFP_KERNEL on the submission path")
CC: stable(a)vger.kernel.org # 4.6+
Reviewed-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 902819d3cf41..bbd1b36f4918 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -322,6 +322,7 @@ static struct full_stripe_lock *insert_full_stripe_lock(
struct rb_node *parent = NULL;
struct full_stripe_lock *entry;
struct full_stripe_lock *ret;
+ unsigned int nofs_flag;
lockdep_assert_held(&locks_root->lock);
@@ -339,8 +340,17 @@ static struct full_stripe_lock *insert_full_stripe_lock(
}
}
- /* Insert new lock */
+ /*
+ * Insert new lock.
+ *
+ * We must use GFP_NOFS because the scrub task might be waiting for a
+ * worker task executing this function and in turn a transaction commit
+ * might be waiting the scrub task to pause (which needs to wait for all
+ * the worker tasks to complete before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!ret)
return ERR_PTR(-ENOMEM);
ret->logical = fstripe_logical;
@@ -1620,8 +1630,19 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
mutex_lock(&sctx->wr_lock);
again:
if (!sctx->wr_curr_bio) {
+ unsigned int nofs_flag;
+
+ /*
+ * We must use GFP_NOFS because the scrub task might be waiting
+ * for a worker task executing this function and in turn a
+ * transaction commit might be waiting the scrub task to pause
+ * (which needs to wait for all the worker tasks to complete
+ * before pausing).
+ */
+ nofs_flag = memalloc_nofs_save();
sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
GFP_KERNEL);
+ memalloc_nofs_restore(nofs_flag);
if (!sctx->wr_curr_bio) {
mutex_unlock(&sctx->wr_lock);
return -ENOMEM;
@@ -3772,6 +3793,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
struct scrub_ctx *sctx;
int ret;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
if (btrfs_fs_closing(fs_info))
return -EINVAL;
@@ -3875,6 +3897,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
+ /*
+ * In order to avoid deadlock with reclaim when there is a transaction
+ * trying to pause scrub, make sure we use GFP_NOFS for all the
+ * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
+ * invoked by our callees. The pausing request is done when the
+ * transaction commit starts, and it blocks the transaction until scrub
+ * is paused (done at specific points at scrub_stripe() or right above
+ * before incrementing fs_info->scrubs_running).
+ */
+ nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
/*
* by holding device list mutex, we can
@@ -3887,6 +3919,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!ret)
ret = scrub_enumerate_chunks(sctx, dev, start, end);
+ memalloc_nofs_restore(nofs_flag);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0d228ece59a35a9b9e8ff0d40653234a6d90f61e Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain(a)oracle.com>
Date: Sun, 11 Nov 2018 22:22:17 +0800
Subject: [PATCH] btrfs: dev-replace: go back to suspended state if target
device is missing
At the time of forced unmount we place the running replace to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
back and expect the target device is missing.
Then let the replace state continue to be in
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state instead of
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any matching scrub
running as part of replace.
Fixes: e93c89c1aaaa ("Btrfs: add new sources for device replace code")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain(a)oracle.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 32da6901dc88..11df8f778b63 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -890,6 +890,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 0d228ece59a35a9b9e8ff0d40653234a6d90f61e Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain(a)oracle.com>
Date: Sun, 11 Nov 2018 22:22:17 +0800
Subject: [PATCH] btrfs: dev-replace: go back to suspended state if target
device is missing
At the time of forced unmount we place the running replace to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
back and expect the target device is missing.
Then let the replace state continue to be in
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state instead of
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any matching scrub
running as part of replace.
Fixes: e93c89c1aaaa ("Btrfs: add new sources for device replace code")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain(a)oracle.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 32da6901dc88..11df8f778b63 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -890,6 +890,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
btrfs_dev_replace_write_unlock(dev_replace);
return 0;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 65b6657672388b72822e0367f06d41c1e3ffb5bb Mon Sep 17 00:00:00 2001
From: Jernej Skrabec <jernej.skrabec(a)siol.net>
Date: Sun, 4 Nov 2018 19:26:40 +0100
Subject: [PATCH] clk: sunxi-ng: Use u64 for calculation of NM rate
Allwinner H6 SoC has multiplier N range between 1 and 254. Since parent
rate is 24MHz, intermediate result when calculating final rate easily
overflows 32 bit variable.
Because of that, introduce function for calculating clock rate which
uses 64 bit variable for intermediate result.
Fixes: 6174a1e24b0d ("clk: sunxi-ng: Add N-M-factor clock support")
Fixes: ee28648cb2b4 ("clk: sunxi-ng: Remove the use of rational computations")
CC: <stable(a)vger.kernel.org>
Signed-off-by: Jernej Skrabec <jernej.skrabec(a)siol.net>
Signed-off-by: Maxime Ripard <maxime.ripard(a)bootlin.com>
diff --git a/drivers/clk/sunxi-ng/ccu_nm.c b/drivers/clk/sunxi-ng/ccu_nm.c
index 6fe3c14f7b2d..424d8635b053 100644
--- a/drivers/clk/sunxi-ng/ccu_nm.c
+++ b/drivers/clk/sunxi-ng/ccu_nm.c
@@ -19,6 +19,17 @@ struct _ccu_nm {
unsigned long m, min_m, max_m;
};
+static unsigned long ccu_nm_calc_rate(unsigned long parent,
+ unsigned long n, unsigned long m)
+{
+ u64 rate = parent;
+
+ rate *= n;
+ do_div(rate, m);
+
+ return rate;
+}
+
static void ccu_nm_find_best(unsigned long parent, unsigned long rate,
struct _ccu_nm *nm)
{
@@ -28,7 +39,8 @@ static void ccu_nm_find_best(unsigned long parent, unsigned long rate,
for (_n = nm->min_n; _n <= nm->max_n; _n++) {
for (_m = nm->min_m; _m <= nm->max_m; _m++) {
- unsigned long tmp_rate = parent * _n / _m;
+ unsigned long tmp_rate = ccu_nm_calc_rate(parent,
+ _n, _m);
if (tmp_rate > rate)
continue;
@@ -100,7 +112,7 @@ static unsigned long ccu_nm_recalc_rate(struct clk_hw *hw,
if (ccu_sdm_helper_is_enabled(&nm->common, &nm->sdm))
rate = ccu_sdm_helper_read_rate(&nm->common, &nm->sdm, m, n);
else
- rate = parent_rate * n / m;
+ rate = ccu_nm_calc_rate(parent_rate, n, m);
if (nm->common.features & CCU_FEATURE_FIXED_POSTDIV)
rate /= nm->fixed_post_div;
@@ -149,7 +161,7 @@ static long ccu_nm_round_rate(struct clk_hw *hw, unsigned long rate,
_nm.max_m = nm->m.max ?: 1 << nm->m.width;
ccu_nm_find_best(*parent_rate, rate, &_nm);
- rate = *parent_rate * _nm.n / _nm.m;
+ rate = ccu_nm_calc_rate(*parent_rate, _nm.n, _nm.m);
if (nm->common.features & CCU_FEATURE_FIXED_POSTDIV)
rate /= nm->fixed_post_div;