408 lines
12 KiB
Diff
408 lines
12 KiB
Diff
commit 6426c5b02d4aab620219b08a5d97ad8851b56b0d
|
|
Author: Tejun Heo <tj@kernel.org>
|
|
Date: Fri Mar 11 07:31:23 2016 -0500
|
|
|
|
sched: Misc preps for cgroup unified hierarchy interface
|
|
|
|
Make the following changes in preparation for the cpu controller
|
|
interface implementation for the unified hierarchy. This patch
|
|
doesn't cause any functional differences.
|
|
|
|
* s/cpu_stats_show()/cpu_cfs_stats_show()/
|
|
|
|
* s/cpu_files/cpu_legacy_files/
|
|
|
|
* Separate out cpuacct_stats_read() from cpuacct_stats_show(). While
|
|
at it, remove pointless cpuacct_stat_desc[] array.
|
|
|
|
Signed-off-by: Tejun Heo <tj@kernel.org>
|
|
Cc: Ingo Molnar <mingo@redhat.com>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Cc: Li Zefan <lizefan@huawei.com>
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index d1f7149..0d34f35 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -8371,7 +8371,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
|
return ret;
|
|
}
|
|
|
|
-static int cpu_stats_show(struct seq_file *sf, void *v)
|
|
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
|
|
{
|
|
struct task_group *tg = css_tg(seq_css(sf));
|
|
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
|
@@ -8411,7 +8411,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
|
|
}
|
|
#endif /* CONFIG_RT_GROUP_SCHED */
|
|
|
|
-static struct cftype cpu_files[] = {
|
|
+static struct cftype cpu_legacy_files[] = {
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
{
|
|
.name = "shares",
|
|
@@ -8432,7 +8432,7 @@ static struct cftype cpu_files[] = {
|
|
},
|
|
{
|
|
.name = "stat",
|
|
- .seq_show = cpu_stats_show,
|
|
+ .seq_show = cpu_cfs_stats_show,
|
|
},
|
|
#endif
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
|
@@ -8457,7 +8457,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
|
.fork = cpu_cgroup_fork,
|
|
.can_attach = cpu_cgroup_can_attach,
|
|
.attach = cpu_cgroup_attach,
|
|
- .legacy_cftypes = cpu_files,
|
|
+ .legacy_cftypes = cpu_legacy_files,
|
|
.early_init = true,
|
|
};
|
|
|
|
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
|
index 4a81120..b99030a 100644
|
|
--- a/kernel/sched/cpuacct.c
|
|
+++ b/kernel/sched/cpuacct.c
|
|
@@ -180,36 +180,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
|
|
return 0;
|
|
}
|
|
|
|
-static const char * const cpuacct_stat_desc[] = {
|
|
- [CPUACCT_STAT_USER] = "user",
|
|
- [CPUACCT_STAT_SYSTEM] = "system",
|
|
-};
|
|
-
|
|
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
|
+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
|
|
{
|
|
- struct cpuacct *ca = css_ca(seq_css(sf));
|
|
int cpu;
|
|
- s64 val = 0;
|
|
|
|
+ *userp = 0;
|
|
for_each_online_cpu(cpu) {
|
|
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
|
- val += kcpustat->cpustat[CPUTIME_USER];
|
|
- val += kcpustat->cpustat[CPUTIME_NICE];
|
|
+ *userp += kcpustat->cpustat[CPUTIME_USER];
|
|
+ *userp += kcpustat->cpustat[CPUTIME_NICE];
|
|
}
|
|
- val = cputime64_to_clock_t(val);
|
|
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
|
|
|
|
- val = 0;
|
|
+ *sysp = 0;
|
|
for_each_online_cpu(cpu) {
|
|
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
|
|
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
|
|
- val += kcpustat->cpustat[CPUTIME_IRQ];
|
|
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
|
+ *sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
|
|
+ *sysp += kcpustat->cpustat[CPUTIME_IRQ];
|
|
+ *sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
|
|
}
|
|
+}
|
|
|
|
- val = cputime64_to_clock_t(val);
|
|
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
|
|
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
|
|
+{
|
|
+ cputime64_t user, sys;
|
|
|
|
+ cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
|
|
+ seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
|
|
+ seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
|
|
return 0;
|
|
}
|
|
|
|
|
|
commit d2a799f795a5d5a69c9dc365c34f926e0649f840
|
|
Author: Tejun Heo <tj@kernel.org>
|
|
Date: Fri Mar 11 07:31:23 2016 -0500
|
|
|
|
sched: Implement interface for cgroup unified hierarchy
|
|
|
|
While the cpu controller doesn't have any functional problems, there
|
|
are a couple interface issues which can be addressed in the v2
|
|
interface.
|
|
|
|
* cpuacct being a separate controller. This separation is artificial
|
|
and rather pointless as demonstrated by most use cases co-mounting
|
|
the two controllers. It also forces certain information to be
|
|
accounted twice.
|
|
|
|
* Use of different time units. Writable control knobs use
|
|
microseconds, some stat fields use nanoseconds while other cpuacct
|
|
stat fields use centiseconds.
|
|
|
|
* Control knobs which can't be used in the root cgroup still show up
|
|
in the root.
|
|
|
|
* Control knob names and semantics aren't consistent with other
|
|
controllers.
|
|
|
|
This patchset implements cpu controller's interface on the unified
|
|
hierarchy which adheres to the controller file conventions described
|
|
in Documentation/cgroups/unified-hierarchy.txt. Overall, the
|
|
following changes are made.
|
|
|
|
* cpuacct is implicitly enabled and disabled by cpu and its information
|
|
is reported through "cpu.stat" which now uses microseconds for all
|
|
time durations. All time duration fields now have "_usec" appended
|
|
to them for clarity. While this doesn't solve the double accounting
|
|
immediately, once majority of users switch to v2, cpu can directly
|
|
account and report the relevant stats and cpuacct can be disabled on
|
|
the unified hierarchy.
|
|
|
|
Note that cpuacct.usage_percpu is currently not included in
|
|
"cpu.stat". If this information is actually called for, it can be
|
|
added later.
|
|
|
|
* "cpu.shares" is replaced with "cpu.weight" and operates on the
|
|
standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
|
|
The weight is scaled to scheduler weight so that 100 maps to 1024
|
|
and the ratio relationship is preserved - if weight is W and its
|
|
scaled value is S, W / 100 == S / 1024. While the mapped range is a
|
|
bit smaller than the original scheduler weight range, the dead zones
|
|
on both sides are relatively small and it covers a wider range than the
|
|
nice value mappings. This file doesn't make sense in the root
|
|
cgroup and isn't created on root.
|
|
|
|
* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
|
|
which contains both quota and period.
|
|
|
|
* "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
|
|
"cpu.rt.max" which contains both runtime and period.
|
|
|
|
v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
|
|
CFS bandwidth stats and also using raw division for u64. Use
|
|
CONFIG_CFS_BANDWIDTH and do_div() instead.
|
|
|
|
The semantics of "cpu.rt.max" is not fully decided yet. Dropped
|
|
for now.
|
|
|
|
Signed-off-by: Tejun Heo <tj@kernel.org>
|
|
Cc: Ingo Molnar <mingo@redhat.com>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Cc: Li Zefan <lizefan@huawei.com>
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index 0d34f35..5990efc 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -8450,6 +8450,139 @@ static struct cftype cpu_legacy_files[] = {
|
|
{ } /* terminate */
|
|
};
|
|
|
|
+static int cpu_stats_show(struct seq_file *sf, void *v)
|
|
+{
|
|
+ cpuacct_cpu_stats_show(sf);
|
|
+
|
|
+#ifdef CONFIG_CFS_BANDWIDTH
|
|
+ {
|
|
+ struct task_group *tg = css_tg(seq_css(sf));
|
|
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
|
|
+ u64 throttled_usec;
|
|
+
|
|
+ throttled_usec = cfs_b->throttled_time;
|
|
+ do_div(throttled_usec, NSEC_PER_USEC);
|
|
+
|
|
+ seq_printf(sf, "nr_periods %d\n"
|
|
+ "nr_throttled %d\n"
|
|
+ "throttled_usec %llu\n",
|
|
+ cfs_b->nr_periods, cfs_b->nr_throttled,
|
|
+ throttled_usec);
|
|
+ }
|
|
+#endif
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
|
|
+ struct cftype *cft)
|
|
+{
|
|
+ struct task_group *tg = css_tg(css);
|
|
+ u64 weight = scale_load_down(tg->shares);
|
|
+
|
|
+ return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
|
|
+}
|
|
+
|
|
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
|
|
+ struct cftype *cftype, u64 weight)
|
|
+{
|
|
+ /*
|
|
+ * cgroup weight knobs should use the common MIN, DFL and MAX
|
|
+ * values which are 1, 100 and 10000 respectively. While it loses
|
|
+ * a bit of range on both ends, it maps pretty well onto the shares
|
|
+ * value used by scheduler and the round-trip conversions preserve
|
|
+ * the original value over the entire range.
|
|
+ */
|
|
+ if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
|
|
+ return -ERANGE;
|
|
+
|
|
+ weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
|
|
+
|
|
+ return sched_group_set_shares(css_tg(css), scale_load(weight));
|
|
+}
|
|
+#endif
|
|
+
|
|
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
|
|
+ long period, long quota)
|
|
+{
|
|
+ if (quota < 0)
|
|
+ seq_puts(sf, "max");
|
|
+ else
|
|
+ seq_printf(sf, "%ld", quota);
|
|
+
|
|
+ seq_printf(sf, " %ld\n", period);
|
|
+}
|
|
+
|
|
+/* caller should put the current value in *@periodp before calling */
|
|
+static int __maybe_unused cpu_period_quota_parse(char *buf,
|
|
+ u64 *periodp, u64 *quotap)
|
|
+{
|
|
+ char tok[21]; /* U64_MAX */
|
|
+
|
|
+ if (!sscanf(buf, "%s %llu", tok, periodp))
|
|
+ return -EINVAL;
|
|
+
|
|
+ *periodp *= NSEC_PER_USEC;
|
|
+
|
|
+ if (sscanf(tok, "%llu", quotap))
|
|
+ *quotap *= NSEC_PER_USEC;
|
|
+ else if (!strcmp(tok, "max"))
|
|
+ *quotap = RUNTIME_INF;
|
|
+ else
|
|
+ return -EINVAL;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_CFS_BANDWIDTH
|
|
+static int cpu_max_show(struct seq_file *sf, void *v)
|
|
+{
|
|
+ struct task_group *tg = css_tg(seq_css(sf));
|
|
+
|
|
+ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
|
|
+ char *buf, size_t nbytes, loff_t off)
|
|
+{
|
|
+ struct task_group *tg = css_tg(of_css(of));
|
|
+ u64 period = tg_get_cfs_period(tg);
|
|
+ u64 quota;
|
|
+ int ret;
|
|
+
|
|
+ ret = cpu_period_quota_parse(buf, &period, &quota);
|
|
+ if (!ret)
|
|
+ ret = tg_set_cfs_bandwidth(tg, period, quota);
|
|
+ return ret ?: nbytes;
|
|
+}
|
|
+#endif
|
|
+
|
|
+static struct cftype cpu_files[] = {
|
|
+ {
|
|
+ .name = "stat",
|
|
+ .flags = CFTYPE_NOT_ON_ROOT,
|
|
+ .seq_show = cpu_stats_show,
|
|
+ },
|
|
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
+ {
|
|
+ .name = "weight",
|
|
+ .flags = CFTYPE_NOT_ON_ROOT,
|
|
+ .read_u64 = cpu_weight_read_u64,
|
|
+ .write_u64 = cpu_weight_write_u64,
|
|
+ },
|
|
+#endif
|
|
+#ifdef CONFIG_CFS_BANDWIDTH
|
|
+ {
|
|
+ .name = "max",
|
|
+ .flags = CFTYPE_NOT_ON_ROOT,
|
|
+ .seq_show = cpu_max_show,
|
|
+ .write = cpu_max_write,
|
|
+ },
|
|
+#endif
|
|
+ { } /* terminate */
|
|
+};
|
|
+
|
|
struct cgroup_subsys cpu_cgrp_subsys = {
|
|
.css_alloc = cpu_cgroup_css_alloc,
|
|
.css_released = cpu_cgroup_css_released,
|
|
@@ -8458,7 +8591,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
|
|
.can_attach = cpu_cgroup_can_attach,
|
|
.attach = cpu_cgroup_attach,
|
|
.legacy_cftypes = cpu_legacy_files,
|
|
+ .dfl_cftypes = cpu_files,
|
|
.early_init = true,
|
|
+#ifdef CONFIG_CGROUP_CPUACCT
|
|
+ /*
|
|
+ * cpuacct is enabled together with cpu on the unified hierarchy
|
|
+ * and its stats are reported through "cpu.stat".
|
|
+ */
|
|
+ .depends_on = 1 << cpuacct_cgrp_id,
|
|
+#endif
|
|
};
|
|
|
|
#endif /* CONFIG_CGROUP_SCHED */
|
|
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
|
|
index b99030a..a1a5a4b 100644
|
|
--- a/kernel/sched/cpuacct.c
|
|
+++ b/kernel/sched/cpuacct.c
|
|
@@ -227,6 +227,30 @@ static struct cftype files[] = {
|
|
{ } /* terminate */
|
|
};
|
|
|
|
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
|
|
+void cpuacct_cpu_stats_show(struct seq_file *sf)
|
|
+{
|
|
+ struct cgroup_subsys_state *css;
|
|
+ u64 usage, user, sys;
|
|
+
|
|
+ css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
|
|
+
|
|
+ usage = cpuusage_read(css, seq_cft(sf));
|
|
+ cpuacct_stats_read(css_ca(css), &user, &sys);
|
|
+
|
|
+ user *= TICK_NSEC;
|
|
+ sys *= TICK_NSEC;
|
|
+ do_div(usage, NSEC_PER_USEC);
|
|
+ do_div(user, NSEC_PER_USEC);
|
|
+ do_div(sys, NSEC_PER_USEC);
|
|
+
|
|
+ seq_printf(sf, "usage_usec %llu\n"
|
|
+ "user_usec %llu\n"
|
|
+ "system_usec %llu\n", usage, user, sys);
|
|
+
|
|
+ css_put(css);
|
|
+}
|
|
+
|
|
/*
|
|
* charge this task's execution time to its accounting group.
|
|
*
|
|
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
|
|
index ba72807..ddf7af4 100644
|
|
--- a/kernel/sched/cpuacct.h
|
|
+++ b/kernel/sched/cpuacct.h
|
|
@@ -2,6 +2,7 @@
|
|
|
|
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
|
extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
|
|
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
|
|
|
|
#else
|
|
|
|
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
|
|
{
|
|
}
|
|
|
|
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
|
|
+{
|
|
+}
|
|
+
|
|
#endif
|