Commit 3431a940 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'x86-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 AVX512 status update from Ingo Molnar:
 "This adds a new ABI that the main scheduler probably doesn't want to
  deal with but HPC job schedulers might want to use: the
  AVX512_elapsed_ms field in the new /proc/<pid>/arch_status task status
  file, which allows the user-space job scheduler to cluster such tasks,
  to avoid turbo frequency drops"

* 'x86-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  Documentation/filesystems/proc.txt: Add arch_status file
  x86/process: Add AVX-512 usage elapsed time to /proc/pid/arch_status
  proc: Add /proc/<pid>/arch_status
parents 5b7a2095 711486fd
Loading
Loading
Loading
Loading
+40 −0
Original line number Diff line number Diff line
@@ -45,6 +45,7 @@ Table of Contents
  3.9   /proc/<pid>/map_files - Information about memory mapped files
  3.10  /proc/<pid>/timerslack_ns - Task timerslack value
  3.11	/proc/<pid>/patch_state - Livepatch patch operation state
  3.12	/proc/<pid>/arch_status - Task architecture specific information

  4	Configuring procfs
  4.1	Mount options
@@ -1948,6 +1949,45 @@ patched. If the patch is being enabled, then the task has already been
patched.  If the patch is being disabled, then the task hasn't been
unpatched yet.

3.12 /proc/<pid>/arch_status - task architecture specific status
-------------------------------------------------------------------
When CONFIG_PROC_PID_ARCH_STATUS is enabled, this file displays the
architecture specific status of the task.

Example
-------
 $ cat /proc/6753/arch_status
 AVX512_elapsed_ms:      8

Description
-----------

x86 specific entries:
---------------------
 AVX512_elapsed_ms:
 ------------------
  If AVX512 is supported on the machine, this entry shows the milliseconds
  elapsed since the last time AVX512 usage was recorded. The recording
  happens on a best effort basis when a task is scheduled out. This means
  that the value depends on two factors:

    1) The time which the task spent on the CPU without being scheduled
       out. With CPU isolation and a single runnable task this can take
       several seconds.

    2) The time since the task was scheduled out last. Depending on the
       reason for being scheduled out (time slice exhausted, syscall ...)
       this can be an arbitrarily long time.

  As a consequence the value cannot be considered precise and authoritative
  information. The application which uses this information has to be aware
  of the overall scenario on the system in order to determine whether a
  task is a real AVX512 user or not. Precise information can be obtained
  with performance counters.

  A special value of '-1' indicates that no AVX512 usage was recorded, thus
  the task is unlikely to be an AVX512 user. However, depending on the
  workload and the scheduling scenario, this could also be a false negative
  of the kind described above.

------------------------------------------------------------------------------
Configuring procfs
+1 −0
Original line number Diff line number Diff line
@@ -220,6 +220,7 @@ config X86
	select USER_STACKTRACE_SUPPORT
	select VIRT_TO_BUS
	select X86_FEATURE_NAMES		if PROC_FS
	select PROC_PID_ARCH_STATUS		if PROC_FS

config INSTRUCTION_DECODER
	def_bool y
+47 −0
Original line number Diff line number Diff line
@@ -8,6 +8,8 @@
#include <linux/cpu.h>
#include <linux/mman.h>
#include <linux/pkeys.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>

#include <asm/fpu/api.h>
#include <asm/fpu/internal.h>
@@ -1231,3 +1233,48 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)

	return 0;
}

#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
 * Print the "AVX512_elapsed_ms" line for @task into @m.
 *
 * The value is the number of milliseconds between the task's recorded
 * AVX512 timestamp and the current jiffies value, or -1 when no
 * timestamp has been recorded (the timestamp field reads as zero).
 */
static void avx512_status(struct seq_file *m, struct task_struct *task)
{
	unsigned long last_use = READ_ONCE(task->thread.fpu.avx512_timestamp);
	/* -1 is what gets printed when the timestamp is zero */
	long elapsed_ms = -1;

	if (last_use) {
		long elapsed_jiffies = (long)(jiffies - last_use);

		/*
		 * A negative value means the unsigned difference does not
		 * fit into a long: clamp it to LONG_MAX before converting.
		 */
		if (elapsed_jiffies < 0)
			elapsed_jiffies = LONG_MAX;

		elapsed_ms = jiffies_to_msecs(elapsed_jiffies);
	}

	seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", elapsed_ms);
	seq_putc(m, '\n');
}

/*
 * Emit the architecture specific status lines of @task into @m.
 *
 * @ns and @pid are not used here. Always returns 0.
 */
int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
			struct pid *pid, struct task_struct *task)
{
	/*
	 * The AVX512 line is only emitted when the AVX512F feature is
	 * enabled; otherwise nothing is written.
	 */
	if (!cpu_feature_enabled(X86_FEATURE_AVX512F))
		return 0;

	avx512_status(m, task);
	return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
+4 −0
Original line number Diff line number Diff line
@@ -98,3 +98,7 @@ config PROC_CHILDREN

	  Say Y if you are running any user-space software which takes benefit from
	  this interface. For example, rkt is such a piece of software.

config PROC_PID_ARCH_STATUS
	def_bool n
	depends on PROC_FS
+6 −0
Original line number Diff line number Diff line
@@ -3061,6 +3061,9 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_STACKLEAK_METRICS
	ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
};

static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3448,6 +3451,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
	ONE("arch_status", S_IRUGO, proc_pid_arch_status),
#endif
};

static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
Loading