The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 764baba80168ad3adafb521d2ab483ccbc49e344 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il(a)gmail.com>
Date: Sun, 4 Feb 2018 15:35:09 +0200
Subject: [PATCH] ovl: hash non-dir by lower inode for fsnotify
Commit 31747eda41ef ("ovl: hash directory inodes for fsnotify")
fixed an issue of inotify watch on directory that stops getting
events after dropping dentry caches.
A similar issue exists for non-dir non-upper files, for example:
$ mkdir -p lower upper work merged
$ touch lower/foo
$ mount -t overlay -o
lowerdir=lower,workdir=work,upperdir=upper none merged
$ inotifywait merged/foo &
$ echo 2 > /proc/sys/vm/drop_caches
$ cat merged/foo
inotifywait doesn't get the OPEN event, because ovl_lookup() called
from 'cat' allocates a new overlay inode and does not reuse the
watched inode.
Fix this by hashing non-dir overlay inodes by lower real inode in
the following cases that were not hashed before this change:
- A non-upper overlay mount
- A lower non-hardlink when index=off
A helper ovl_hash_bylower() was added to put all the logic and
documentation about which real inode an overlay inode is hashed by
into one place.
The issue dates back to initial version of overlayfs, but this
patch depends on ovl_inode code that was introduced in kernel v4.13.
Cc: <stable(a)vger.kernel.org> #v4.13
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi(a)redhat.com>
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index fcd97b783fa1..3b1bd469accd 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -669,38 +669,59 @@ struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
return inode;
}
+/*
+ * Does overlay inode need to be hashed by lower inode?
+ */
+static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
+ struct dentry *lower, struct dentry *index)
+{
+ struct ovl_fs *ofs = sb->s_fs_info;
+
+ /* No, if pure upper */
+ if (!lower)
+ return false;
+
+ /* Yes, if already indexed */
+ if (index)
+ return true;
+
+ /* Yes, if won't be copied up */
+ if (!ofs->upper_mnt)
+ return true;
+
+ /* No, if lower hardlink is or will be broken on copy up */
+ if ((upper || !ovl_indexdir(sb)) &&
+ !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
+ return false;
+
+ /* No, if non-indexed upper with NFS export */
+ if (sb->s_export_op && upper)
+ return false;
+
+ /* Otherwise, hash by lower inode for fsnotify */
+ return true;
+}
+
struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
struct dentry *lowerdentry, struct dentry *index,
unsigned int numlower)
{
- struct ovl_fs *ofs = sb->s_fs_info;
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
- /* Already indexed or could be indexed on copy up? */
- bool indexed = (index || (ovl_indexdir(sb) && !upperdentry));
- struct dentry *origin = indexed ? lowerdentry : NULL;
+ bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index);
bool is_dir;
- if (WARN_ON(upperdentry && indexed && !lowerdentry))
- return ERR_PTR(-EIO);
-
if (!realinode)
realinode = d_inode(lowerdentry);
/*
- * Copy up origin (lower) may exist for non-indexed non-dir upper, but
- * we must not use lower as hash key in that case.
- * Hash non-dir that is or could be indexed by origin inode.
- * Hash dir that is or could be merged by origin inode.
- * Hash pure upper and non-indexed non-dir by upper inode.
- * Hash non-indexed dir by upper inode for NFS export.
+ * Copy up origin (lower) may exist for non-indexed upper, but we must
+ * not use lower as hash key if this is a broken hardlink.
*/
is_dir = S_ISDIR(realinode->i_mode);
- if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt))
- origin = lowerdentry;
-
- if (upperdentry || origin) {
- struct inode *key = d_inode(origin ?: upperdentry);
+ if (upperdentry || bylower) {
+ struct inode *key = d_inode(bylower ? lowerdentry :
+ upperdentry);
unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
inode = iget5_locked(sb, (unsigned long) key,
@@ -728,6 +749,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
set_nlink(inode, nlink);
} else {
+ /* Lower hardlink that will be broken on copy up */
inode = new_inode(sb);
if (!inode)
goto out_nomem;
According to the official documentation for HFS+ [1], inode timestamps
are supposed to cover the time range from 1904 to 2040 as originally
used in classic MacOS.
The traditional Linux usage is to convert the timestamps into an unsigned
32-bit number based on the Unix epoch and from there to a time_t. On
32-bit systems, that wraps the time from 2038 to 1902, so the last
two years of the valid time range become garbled. On 64-bit systems,
all times before 1970 get turned into timestamps between 2038 and 2106,
which is more convenient but also different from the documented behavior.
The same behavior is used in Darwin and presumaby all versions of MacOS X,
as seen in the to_hfs_time() function in [2]. It is unclear whether this
is a bug in the file system code, or intentional but undocumented behavior.
This changes Linux over to the traditional MacOS (pre MacOS X)
behavior. This means all files that are created on MacOS X or Linux
with future timestamps between 2040 and 2106 will now show up as past
dates. Timestamps between 2038 and 2040 will still be represented
incorrectly on 32-bit architectures as times between 1902 and 1904,
but that will be fixed once we have user space with 64-bit time_t.
Cc: stable(a)vger.kernel.org
Link: [1] https://developer.apple.com/library/archive/technotes/tn/tn1150.html
Link: [2] https://opensource.apple.com/source/xnu/xnu-344/bsd/hfs/MacOSStubs.c
Suggested-by: Viacheslav Dubeyko <slava(a)dubeyko.com>
Signed-off-by: Arnd Bergmann <arnd(a)arndb.de>
---
Note: This is the patch that Viacheslav asked for, but given how
MacOS X behaves, I'm increasingly thinking this is a bad idea.
---
fs/hfs/hfs_fs.h | 2 +-
fs/hfsplus/hfsplus_fs.h | 5 +++--
2 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 6d0783e2e276..39c1f3a43ed8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -247,7 +247,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb);
*
*/
#define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60)
-#define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60)
+#define __hfs_m_to_utime(sec) ((time64_t)be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60)
#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode))
#define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info)
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index d9255abafb81..57838ef4dcdc 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -530,8 +530,9 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf,
void **data, int op, int op_flags);
int hfsplus_read_wrapper(struct super_block *sb);
-/* time macros */
-#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U)
+/* time macros: convert between 1904-2040 and 1970-2106 range,
+ * pre-1970 timestamps are interpreted as post-2038 times after wrap-around */
+#define __hfsp_mt2ut(t) ((time64_t)be32_to_cpu(t) - 2082844800U)
#define __hfsp_ut2mt(t) (cpu_to_be32(t + 2082844800U))
/* compatibility */
--
2.9.0
From: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
Date: Sun, 8 Apr 2018 23:35:28 +0200
commit 9564a8cf422d7b58f6e857e3546d346fa970191e upstream.
I tried building using a freshly built Make (4.2.1-69-g8a731d1), but
already the objtool build broke with
orc_dump.c: In function ‘orc_dump’:
orc_dump.c:106:2: error: ‘elf_getshnum’ is deprecated [-Werror=deprecated-declarations]
if (elf_getshdrnum(elf, &nr_sections)) {
Turns out that with that new Make, the backslash was not removed, so cpp
didn't see a #include directive, grep found nothing, and
-DLIBELF_USE_DEPRECATED was wrongly put in CFLAGS.
Now, that new Make behaviour is documented in their NEWS file:
* WARNING: Backward-incompatibility!
Number signs (#) appearing inside a macro reference or function invocation
no longer introduce comments and should not be escaped with backslashes:
thus a call such as:
foo := $(shell echo '#')
is legal. Previously the number sign needed to be escaped, for example:
foo := $(shell echo '\#')
Now this latter will resolve to "\#". If you want to write makefiles
portable to both versions, assign the number sign to a variable:
C := \#
foo := $(shell echo '$C')
This was claimed to be fixed in 3.81, but wasn't, for some reason.
To detect this change search for 'nocomment' in the .FEATURES variable.
This also fixes up the two make-cmd instances to replace # with $(pound)
rather than with \#. There might very well be other places that need
similar fixup in preparation for whatever future Make release contains
the above change, but at least this builds an x86_64 defconfig with the
new make.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=197847
Cc: Randy Dunlap <rdunlap(a)infradead.org>
Signed-off-by: Rasmus Villemoes <linux(a)rasmusvillemoes.dk>
Signed-off-by: Masahiro Yamada <yamada.masahiro(a)socionext.com>
Signed-off-by: Paul Menzel <pmenzel(a)molgen.mpg.de>
---
Fix one conflict in `scripts/Kbuild.include`.
scripts/Kbuild.include | 5 +++--
tools/build/Build.include | 5 +++--
tools/objtool/Makefile | 2 +-
tools/scripts/Makefile.include | 2 ++
4 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 97769465de13..fcbbecf92395 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -8,6 +8,7 @@ squote := '
empty :=
space := $(empty) $(empty)
space_escape := _-_SPACE_-_
+pound := \#
###
# Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o
@@ -251,11 +252,11 @@ endif
# Replace >$< with >$$< to preserve $ when reloading the .cmd file
# (needed for make)
-# Replace >#< with >\#< to avoid starting a comment in the .cmd file
+# Replace >#< with >$(pound)< to avoid starting a comment in the .cmd file
# (needed for make)
# Replace >'< with >'\''< to be able to enclose the whole string in '...'
# (needed for the shell)
-make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1)))))
+make-cmd = $(call escsq,$(subst $(pound),$$(pound),$(subst $$,$$$$,$(cmd_$(1)))))
# Find any prerequisites that is newer than target or that does not exist.
# PHONY targets skipped in both cases.
diff --git a/tools/build/Build.include b/tools/build/Build.include
index 418871d02ebf..a4bbb984941d 100644
--- a/tools/build/Build.include
+++ b/tools/build/Build.include
@@ -12,6 +12,7 @@
# Convenient variables
comma := ,
squote := '
+pound := \#
###
# Name of target with a '.' as filename prefix. foo/bar.o => foo/.bar.o
@@ -43,11 +44,11 @@ echo-cmd = $(if $($(quiet)cmd_$(1)),\
###
# Replace >$< with >$$< to preserve $ when reloading the .cmd file
# (needed for make)
-# Replace >#< with >\#< to avoid starting a comment in the .cmd file
+# Replace >#< with >$(pound)< to avoid starting a comment in the .cmd file
# (needed for make)
# Replace >'< with >'\''< to be able to enclose the whole string in '...'
# (needed for the shell)
-make-cmd = $(call escsq,$(subst \#,\\\#,$(subst $$,$$$$,$(cmd_$(1)))))
+make-cmd = $(call escsq,$(subst $(pound),$$(pound),$(subst $$,$$$$,$(cmd_$(1)))))
###
# Find any prerequisites that is newer than target or that does not exist.
diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile
index e6acc281dd37..8ae824dbfca3 100644
--- a/tools/objtool/Makefile
+++ b/tools/objtool/Makefile
@@ -35,7 +35,7 @@ CFLAGS += -Wall -Werror $(WARNINGS) -fomit-frame-pointer -O2 -g $(INCLUDES)
LDFLAGS += -lelf $(LIBSUBCMD)
# Allow old libelf to be used:
-elfshdr := $(shell echo '\#include <libelf.h>' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr)
+elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(CC) $(CFLAGS) -x c -E - | grep elf_getshdr)
CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED)
AWK = awk
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index 654efd9768fd..5f3f1f44ed0a 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -101,3 +101,5 @@ ifneq ($(silent),1)
QUIET_INSTALL = @printf ' INSTALL %s\n' $1;
endif
endif
+
+pound := \#
--
2.17.1
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7810e6781e0fcbca78b91cf65053f895bf59e85f Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka(a)suse.cz>
Date: Thu, 7 Jun 2018 17:09:29 -0700
Subject: [PATCH] mm, page_alloc: do not break __GFP_THISNODE by zonelist reset
In __alloc_pages_slowpath() we reset zonelist and preferred_zoneref for
allocations that can ignore memory policies. The zonelist is obtained
from current CPU's node. This is a problem for __GFP_THISNODE
allocations that want to allocate on a different node, e.g. because the
allocating thread has been migrated to a different CPU.
This has been observed to break SLAB in our 4.4-based kernel, because
there it relies on __GFP_THISNODE working as intended. If a slab page
is put on wrong node's list, then further list manipulations may corrupt
the list because page_to_nid() is used to determine which node's
list_lock should be locked and thus we may take a wrong lock and race.
Current SLAB implementation seems to be immune by luck thanks to commit
511e3a058812 ("mm/slab: make cache_grow() handle the page allocated on
arbitrary node") but there may be others assuming that __GFP_THISNODE
works as promised.
We can fix it by simply removing the zonelist reset completely. There
is actually no reason to reset it, because memory policies and cpusets
don't affect the zonelist choice in the first place. This was different
when commit 183f6371aac2 ("mm: ignore mempolicies when using
ALLOC_NO_WATERMARK") introduced the code, as mempolicies provided their
own restricted zonelists.
We might consider this for 4.17 although I don't know if there's
anything currently broken.
SLAB is currently not affected, but in kernels older than 4.7 that don't
yet have 511e3a058812 ("mm/slab: make cache_grow() handle the page
allocated on arbitrary node") it is. That's at least 4.4 LTS. Older
ones I'll have to check.
So stable backports should be more important, but will have to be
reviewed carefully, as the code went through many changes. BTW I think
that also the ac->preferred_zoneref reset is currently useless if we
don't also reset ac->nodemask from a mempolicy to NULL first (which we
probably should for the OOM victims etc?), but I would leave that for a
separate patch.
Link: http://lkml.kernel.org/r/20180525130853.13915-1-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Fixes: 183f6371aac2 ("mm: ignore mempolicies when using ALLOC_NO_WATERMARK")
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef1811531999..07b3c23762ad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4169,7 +4169,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* orientated.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
- ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From dc7a10ddee0c56c6d891dd18de5c4ee9869545e0 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk(a)kernel.org>
Date: Fri, 30 Mar 2018 17:58:13 -0700
Subject: [PATCH] f2fs: truncate preallocated blocks in error case
If write is failed, we must deallocate the blocks that we couldn't write.
Cc: stable(a)vger.kernel.org
Reviewed-by: Chao Yu <yuchao0(a)huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk(a)kernel.org>
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8068b015ece5..6b94f19b3fa8 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2911,6 +2911,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = generic_write_checks(iocb, from);
if (ret > 0) {
+ bool preallocated = false;
+ size_t target_size = 0;
int err;
if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
@@ -2927,6 +2929,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
} else {
+ preallocated = true;
+ target_size = iocb->ki_pos + iov_iter_count(from);
+
err = f2fs_preallocate_blocks(iocb, from);
if (err) {
clear_inode_flag(inode, FI_NO_PREALLOC);
@@ -2939,6 +2944,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
blk_finish_plug(&plug);
clear_inode_flag(inode, FI_NO_PREALLOC);
+ /* if we couldn't write data, we should deallocate blocks. */
+ if (preallocated && i_size_read(inode) < target_size)
+ f2fs_truncate(inode);
+
if (ret > 0)
f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 31286a8484a85e8b4e91ddb0f5415aee8a416827 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Date: Thu, 5 Apr 2018 16:23:05 -0700
Subject: [PATCH] mm: hwpoison: disable memory error handling on 1GB hugepage
Recently the following BUG was reported:
Injecting memory failure for pfn 0x3c0000 at process virtual address 0x7fe300000000
Memory failure: 0x3c0000: recovery action for huge page: Recovered
BUG: unable to handle kernel paging request at ffff8dfcc0003000
IP: gup_pgd_range+0x1f0/0xc20
PGD 17ae72067 P4D 17ae72067 PUD 0
Oops: 0000 [#1] SMP PTI
...
CPU: 3 PID: 5467 Comm: hugetlb_1gb Not tainted 4.15.0-rc8-mm1-abc+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.fc25 04/01/2014
You can easily reproduce this by calling madvise(MADV_HWPOISON) twice on
a 1GB hugepage. This happens because get_user_pages_fast() is not aware
of a migration entry on pud that was created in the 1st madvise() event.
I think that conversion to pud-aligned migration entry is working, but
other MM code walking over page table isn't prepared for it. We need
some time and effort to make all this work properly, so this patch
avoids the reported bug by just disabling error handling for 1GB
hugepage.
[n-horiguchi(a)ah.jp.nec.com: v2]
Link: http://lkml.kernel.org/r/1517284444-18149-1-git-send-email-n-horiguchi@ah.j…
Link: http://lkml.kernel.org/r/1517207283-15769-1-git-send-email-n-horiguchi@ah.j…
Signed-off-by: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org>
Reviewed-by: Mike Kravetz <mike.kravetz(a)oracle.com>
Acked-by: Punit Agrawal <punit.agrawal(a)arm.com>
Tested-by: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Anshuman Khandual <khandual(a)linux.vnet.ibm.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.vnet.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e40a44a1fae..2e2be527642a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2613,6 +2613,7 @@ enum mf_action_page_type {
MF_MSG_POISONED_HUGE,
MF_MSG_HUGE,
MF_MSG_FREE_HUGE,
+ MF_MSG_NON_PMD_HUGE,
MF_MSG_UNMAP_FAILED,
MF_MSG_DIRTY_SWAPCACHE,
MF_MSG_CLEAN_SWAPCACHE,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8291b75f42c8..2d4bf647cf01 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -502,6 +502,7 @@ static const char * const action_page_types[] = {
[MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
[MF_MSG_HUGE] = "huge page",
[MF_MSG_FREE_HUGE] = "free huge page",
+ [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
[MF_MSG_UNMAP_FAILED] = "unmapping failed page",
[MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
[MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
@@ -1084,6 +1085,21 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
return 0;
}
+ /*
+ * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
+ * simply disable it. In order to make it work properly, we need
+ * make sure that:
+ * - conversion of a pud that maps an error hugetlb into hwpoison
+ * entry properly works, and
+ * - other mm code walking over page table is aware of pud-aligned
+ * hwpoison entries.
+ */
+ if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
+ action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
The patch below does not apply to the 4.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From db6516a5e7ddb6dc72d167b920f2f272596ea22d Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il(a)gmail.com>
Date: Sun, 13 May 2018 22:54:44 -0400
Subject: [PATCH] ext4: do not update s_last_mounted of a frozen fs
If fs is frozen after mount and before the first file open, the
update of s_last_mounted bypasses freeze protection and prints out
a WARNING splat:
$ mount /vdf
$ fsfreeze -f /vdf
$ cat /vdf/foo
[ 31.578555] WARNING: CPU: 1 PID: 1415 at
fs/ext4/ext4_jbd2.c:53 ext4_journal_check_start+0x48/0x82
[ 31.614016] Call Trace:
[ 31.614997] __ext4_journal_start_sb+0xe4/0x1a4
[ 31.616771] ? ext4_file_open+0xb6/0x189
[ 31.618094] ext4_file_open+0xb6/0x189
If fs is frozen, skip s_last_mounted update.
[backport hint: to apply to stable tree, need to apply also patches
vfs: add the sb_start_intwrite_trylock() helper
ext4: factor out helper ext4_sample_last_mounted()]
Cc: stable(a)vger.kernel.org
Fixes: bc0b0d6d69ee ("ext4: update the s_last_mounted field in the superblock")
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Reviewed-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index c48ea76b63e4..7f8023340eb8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -393,7 +393,7 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
return 0;
- if (sb_rdonly(sb))
+ if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
return 0;
sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
@@ -407,21 +407,25 @@ static int ext4_sample_last_mounted(struct super_block *sb,
path.mnt = mnt;
path.dentry = mnt->mnt_root;
cp = d_path(&path, buf, sizeof(buf));
+ err = 0;
if (IS_ERR(cp))
- return 0;
+ goto out;
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ err = PTR_ERR(handle);
if (IS_ERR(handle))
- return PTR_ERR(handle);
+ goto out;
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
- goto out;
+ goto out_journal;
strlcpy(sbi->s_es->s_last_mounted, cp,
sizeof(sbi->s_es->s_last_mounted));
ext4_handle_dirty_super(handle, sb);
-out:
+out_journal:
ext4_journal_stop(handle);
+out:
+ sb_end_intwrite(sb);
return err;
}