From: Frederic Weisbecker frederic@kernel.org
[ Upstream commit e7f2be115f0746b969c0df14c0d182f65f005ca5 ]
getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime than the actual time.
task_cputime_adjusted() snapshots utime and stime and then adjusts their sum to match the scheduler maintained cputime.sum_exec_runtime. Unfortunately in nohz_full, sum_exec_runtime is only updated once per second in the worst case, causing a discrepancy against utime and stime that can be updated anytime by the reader using vtime.
To fix this situation, perform an update of cputime.sum_exec_runtime when the cputime snapshot reports the task as actually running while the tick is disabled. The related overhead is then contained within the relevant situations.
Reported-by: Hasegawa Hitomi hasegawa-hitomi@fujitsu.com Signed-off-by: Frederic Weisbecker frederic@kernel.org Signed-off-by: Hasegawa Hitomi hasegawa-hitomi@fujitsu.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Masayoshi Mizuma m.mizuma@jp.fujitsu.com Acked-by: Phil Auld pauld@redhat.com Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- include/linux/sched/cputime.h | 5 +++-- kernel/sched/cputime.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6c9f19a33865a..ce3c58286062c 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -18,15 +18,16 @@ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, +extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else -static inline void task_cputime(struct task_struct *t, +static inline bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; + return false; }
static inline u64 task_gtime(struct task_struct *t) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d23004524..9cd34730cf623 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -617,7 +617,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) .sum_exec_runtime = p->se.sum_exec_runtime, };
- task_cputime(p, &cputime.utime, &cputime.stime); + if (task_cputime(p, &cputime.utime, &cputime.stime)) + cputime.sum_exec_runtime = task_sched_runtime(p); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } EXPORT_SYMBOL_GPL(task_cputime_adjusted); @@ -830,19 +831,21 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) +bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { struct vtime *vtime = &t->vtime; unsigned int seq; u64 delta; + int ret;
if (!vtime_accounting_enabled()) { *utime = t->utime; *stime = t->stime; - return; + return false; }
do { + ret = false; seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime; @@ -852,6 +855,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (vtime->state < VTIME_SYS) continue;
+ ret = true; delta = vtime_delta(vtime);
/* @@ -863,6 +867,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) else *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return ret; }
static int vtime_state_fetch(struct vtime *vtime, int cpu)
From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit e90e51d5f01d2baae5dcce280866bbb96816e978 ]
There is nothing to synchronize if APICv is disabled, since neither other vCPUs nor assigned devices can set PIR.ON.
Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5b7664d51dc2b..dff8ab5a53280 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7814,10 +7814,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; }
- if (!cpu_has_vmx_apicv()) { + if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - }
if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true;
On 12/13/21 15:20, Sasha Levin wrote:
From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit e90e51d5f01d2baae5dcce280866bbb96816e978 ]
There is nothing to synchronize if APICv is disabled, since neither other vCPUs nor assigned devices can set PIR.ON.
Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org
arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5b7664d51dc2b..dff8ab5a53280 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7814,10 +7814,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; }
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
      enable_apicv = 0;
+ if (!enable_apicv)
      vmx_x86_ops.sync_pir_to_irr = NULL;
- }
if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true;
NACK - the patch is only okay to backport for 5.15
Paolo
From: Vitaly Kuznetsov vkuznets@redhat.com
[ Upstream commit 908fa88e420f30dde6d80f092795a18ec72ca6d3 ]
With the elevated 'KVM_CAP_MAX_VCPUS' value kvm_create_max_vcpus test may hit RLIMIT_NOFILE limits:
# ./kvm_create_max_vcpus KVM_CAP_MAX_VCPU_ID: 4096 KVM_CAP_MAX_VCPUS: 1024 Testing creating 1024 vCPUs, with IDs 0...1023. /dev/kvm not available (errno: 24), skipping test
Adjust RLIMIT_NOFILE limits to make sure KVM_CAP_MAX_VCPUS fds can be opened. Note, raising hard limit ('rlim_max') requires CAP_SYS_RESOURCE capability which is generally not needed to run kvm selftests (but without raising the limit the test is doomed to fail anyway).
Signed-off-by: Vitaly Kuznetsov vkuznets@redhat.com Message-Id: 20211123135953.667434-1-vkuznets@redhat.com [Skip the test if the hard limit cannot be raised. - Paolo] Reviewed-by: Sean Christopherson seanjc@google.com Tested-by: Sean Christopherson seanjc@google.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- .../selftests/kvm/kvm_create_max_vcpus.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+)
diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 0299cd81b8ba2..aa3795cd7bd3d 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -12,6 +12,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/resource.h>
#include "test_util.h"
@@ -40,10 +41,39 @@ int main(int argc, char *argv[]) { int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + /* + * Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds + + * an arbitrary number for everything else. + */ + int nr_fds_wanted = kvm_max_vcpus + 100; + struct rlimit rl;
pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
+ /* + * Check that we're allowed to open nr_fds_wanted file descriptors and + * try raising the limits if needed. + */ + TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!"); + + if (rl.rlim_cur < nr_fds_wanted) { + rl.rlim_cur = nr_fds_wanted; + if (rl.rlim_max < nr_fds_wanted) { + int old_rlim_max = rl.rlim_max; + rl.rlim_max = nr_fds_wanted; + + int r = setrlimit(RLIMIT_NOFILE, &rl); + if (r < 0) { + printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n", + old_rlim_max, nr_fds_wanted); + exit(KSFT_SKIP); + } + } else { + TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); + } + } + /* * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID
On 12/13/21 15:20, Sasha Levin wrote:
From: Vitaly Kuznetsov vkuznets@redhat.com
[ Upstream commit 908fa88e420f30dde6d80f092795a18ec72ca6d3 ]
With the elevated 'KVM_CAP_MAX_VCPUS' value kvm_create_max_vcpus test may hit RLIMIT_NOFILE limits:
# ./kvm_create_max_vcpus KVM_CAP_MAX_VCPU_ID: 4096 KVM_CAP_MAX_VCPUS: 1024 Testing creating 1024 vCPUs, with IDs 0...1023. /dev/kvm not available (errno: 24), skipping test
Adjust RLIMIT_NOFILE limits to make sure KVM_CAP_MAX_VCPUS fds can be opened. Note, raising hard limit ('rlim_max') requires CAP_SYS_RESOURCE capability which is generally not needed to run kvm selftests (but without raising the limit the test is doomed to fail anyway).
Signed-off-by: Vitaly Kuznetsov vkuznets@redhat.com Message-Id: 20211123135953.667434-1-vkuznets@redhat.com [Skip the test if the hard limit cannot be raised. - Paolo] Reviewed-by: Sean Christopherson seanjc@google.com Tested-by: Sean Christopherson seanjc@google.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org
.../selftests/kvm/kvm_create_max_vcpus.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+)
diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index 0299cd81b8ba2..aa3795cd7bd3d 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -12,6 +12,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/resource.h> #include "test_util.h" @@ -40,10 +41,39 @@ int main(int argc, char *argv[]) { int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+ /*
+  * Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds +
+  * an arbitrary number for everything else.
+  */
+ int nr_fds_wanted = kvm_max_vcpus + 100;
+ struct rlimit rl;
pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
+ /*
+  * Check that we're allowed to open nr_fds_wanted file descriptors and
+  * try raising the limits if needed.
+  */
+ TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
+ if (rl.rlim_cur < nr_fds_wanted) {
+ rl.rlim_cur = nr_fds_wanted;
+ if (rl.rlim_max < nr_fds_wanted) {
+ int old_rlim_max = rl.rlim_max;
+ rl.rlim_max = nr_fds_wanted;
+ int r = setrlimit(RLIMIT_NOFILE, &rl);
+ if (r < 0) {
+ printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n",
+ old_rlim_max, nr_fds_wanted);
+ exit(KSFT_SKIP);
+ }
+ } else {
+ TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
+ }
+ }
  /*
   * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID.
   * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID
Acked-by: Paolo Bonzini pbonzini@redhat.com
From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit 5f25e71e311478f9bb0a8ef49e7d8b95316491d7 ]
This is not an unrecoverable situation. Users of kvm_read_guest_offset_cached and kvm_write_guest_offset_cached must expect the read/write to fail, and therefore it is possible to just return early with an error value.
Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- virt/kvm/kvm_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 97ac3c6fd4441..4a7d377b3a500 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2590,7 +2590,8 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int r; gpa_t gpa = ghc->gpa + offset;
- BUG_ON(len + offset > ghc->len); + if (WARN_ON_ONCE(len + offset > ghc->len)) + return -EINVAL;
if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) @@ -2627,7 +2628,8 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int r; gpa_t gpa = ghc->gpa + offset;
- BUG_ON(len + offset > ghc->len); + if (WARN_ON_ONCE(len + offset > ghc->len)) + return -EINVAL;
if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
On 12/13/21 15:20, Sasha Levin wrote:
From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit 5f25e71e311478f9bb0a8ef49e7d8b95316491d7 ]
This is not an unrecoverable situation. Users of kvm_read_guest_offset_cached and kvm_write_guest_offset_cached must expect the read/write to fail, and therefore it is possible to just return early with an error value.
Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org
virt/kvm/kvm_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 97ac3c6fd4441..4a7d377b3a500 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2590,7 +2590,8 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int r; gpa_t gpa = ghc->gpa + offset;
- BUG_ON(len + offset > ghc->len);
+ if (WARN_ON_ONCE(len + offset > ghc->len))
+ return -EINVAL;
if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) @@ -2627,7 +2628,8 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int r; gpa_t gpa = ghc->gpa + offset;
- BUG_ON(len + offset > ghc->len);
+ if (WARN_ON_ONCE(len + offset > ghc->len))
+ return -EINVAL;
if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
Acked-by: Paolo Bonzini pbonzini@redhat.com
linux-stable-mirror@lists.linaro.org