diff --git a/include/linux/sched.h b/include/linux/sched.h
index db660fc8aecc2f6587dde7dd0ee25758dbc50284..a78d877c5f254a93231864f50a3c3d30ac1cf2da 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -969,8 +969,39 @@ struct sched_statistics {
 };
 #endif
 
+#define RAVG_HIST_SIZE 5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+	/*
+	 * 'window_start' marks the beginning of new window
+	 *
+	 * 'mark_start' marks the beginning of an event (task waking up, task
+	 * starting to execute, task being preempted) within a window
+	 *
+	 * 'sum' represents how runnable a task has been within current
+	 * window. It incorporates both running time and wait time and is
+	 * frequency scaled.
+	 *
+	 * 'sum_history' keeps track of history of 'sum' seen over previous
+	 * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+	 * ignored.
+	 *
+	 * 'demand' represents maximum sum seen over previous RAVG_HIST_SIZE
+	 * windows. 'demand' could drive frequency demand for tasks.
+	 */
+	u64 window_start, mark_start;
+	u32 sum, demand;
+	u32 sum_history[RAVG_HIST_SIZE];
+};
+
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
+	/*
+	 * Todo : Move ravg to 'struct task_struct', as this is common for both
+	 * real-time and non-realtime tasks
+	 */
+	struct ravg	ravg;
 	struct rb_node		run_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99cdcf0edf715996498a51457137b20d801ba56a..cee67458c4e41f3c4c750a26ba210901bc247a94 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -36,6 +36,7 @@ extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_wake_to_idle;
+extern unsigned int sysctl_sched_ravg_window;
 
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5eb068721d206ed1a6e71efe10ed5d20a07227bc..701626c1d66ec94dca35999f2de4eac872660b35 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -784,6 +784,7 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 1);
+	rq->cumulative_runnable_avg += p->se.ravg.demand;
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -792,6 +793,8 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 0);
+	rq->cumulative_runnable_avg -= p->se.ravg.demand;
+	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1343,6 +1346,110 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 		wq_worker_waking_up(p, cpu_of(rq));
 }
 
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static inline void
+update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
+{
+	u32 *hist = &p->se.ravg.sum_history[0];
+	int ridx, widx;
+	u32 max = 0;
+
+	/* Ignore windows where task had no activity */
+	if (!runtime)
+		return;
+
+	/* Push new 'runtime' value onto stack */
+	widx = RAVG_HIST_SIZE - 1;
+	ridx = widx - samples;
+	for (; ridx >= 0; --widx, --ridx) {
+		hist[widx] = hist[ridx];
+		if (hist[widx] > max)
+			max = hist[widx];
+	}
+
+	for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE; widx++) {
+		hist[widx] = runtime;
+		if (hist[widx] > max)
+			max = hist[widx];
+	}
+
+	p->se.ravg.sum = 0;
+	if (p->on_rq) {
+		rq->cumulative_runnable_avg -= p->se.ravg.demand;
+		BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+	}
+	/*
+	 * Maximum demand seen over previous RAVG_HIST_SIZE windows drives
+	 * frequency demand for a task. Record maximum in 'demand' attribute.
+	 */
+	p->se.ravg.demand = max;
+	if (p->on_rq)
+		rq->cumulative_runnable_avg += p->se.ravg.demand;
+}
+
+/* Window size (in ns) */
+__read_mostly unsigned int sysctl_sched_ravg_window = 50000000;
+
+void update_task_ravg(struct task_struct *p, struct rq *rq, int update_sum)
+{
+	u32 window_size = sysctl_sched_ravg_window;
+	int new_window;
+	u64 wallclock = sched_clock();
+
+	do {
+		s64 delta = 0;
+		int n;
+		u64 now = wallclock;
+
+		new_window = 0;
+		delta = now - p->se.ravg.window_start;
+		BUG_ON(delta < 0);
+		if (delta > window_size) {
+			p->se.ravg.window_start += window_size;
+			now = p->se.ravg.window_start;
+			new_window = 1;
+		}
+
+		if (update_sum) {
+			delta = now - p->se.ravg.mark_start;
+			BUG_ON(delta < 0);
+
+			if (likely(rq->cur_freq &&
+				   rq->cur_freq <= max_possible_freq))
+				delta = div64_u64(delta * rq->cur_freq,
+						  max_possible_freq);
+			p->se.ravg.sum += delta;
+			WARN_ON(p->se.ravg.sum > window_size);
+		}
+
+		if (!new_window)
+			break;
+
+		update_history(rq, p, p->se.ravg.sum, 1);
+
+		delta = wallclock - p->se.ravg.window_start;
+		BUG_ON(delta < 0);
+		n = div64_u64(delta, window_size);
+		if (n) {
+			if (!update_sum)
+				p->se.ravg.window_start = wallclock;
+			else
+				p->se.ravg.window_start += n * window_size;
+			BUG_ON(p->se.ravg.window_start > wallclock);
+			if (update_sum)
+				update_history(rq, p, window_size, n);
+		}
+		p->se.ravg.mark_start = p->se.ravg.window_start;
+	} while (new_window);
+
+	p->se.ravg.mark_start = wallclock;
+}
+
 /*
  * Mark the task runnable and perform wakeup-preemption.
  */
@@ -1352,6 +1459,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 	check_preempt_curr(rq, p, wake_flags);
 	trace_sched_wakeup(p, true);
 
+	update_task_ravg(p, rq, 0);
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
@@ -1626,6 +1734,8 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  */
 static void __sched_fork(struct task_struct *p)
 {
+	int i;
+
 	p->on_rq			= 0;
 
 	p->se.on_rq			= 0;
@@ -1634,6 +1744,13 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
+	p->se.ravg.sum			= 0;
+	p->se.ravg.demand		= 0;
+	p->se.ravg.window_start		= 0;
+	p->se.ravg.mark_start		= 0;
+	for (i = 0; i < RAVG_HIST_SIZE; ++i)
+		p->se.ravg.sum_history[i] = 0;
+
 	INIT_LIST_HEAD(&p->se.group_node);
 
 /*
@@ -1777,6 +1894,7 @@ void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
+	u64 wallclock = sched_clock();
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
@@ -1790,6 +1908,8 @@ void wake_up_new_task(struct task_struct *p)
 
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
+	p->se.ravg.window_start = wallclock;
+	p->se.ravg.mark_start = wallclock;
 	p->on_rq = 1;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -2904,6 +3024,7 @@ static inline void schedule_debug(struct task_struct *prev)
 
 static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
+	update_task_ravg(prev, rq, 1);
 	if (prev->on_rq || rq->skip_clock_update < 0)
 		update_rq_clock(rq);
 	prev->sched_class->put_prev_task(rq, prev);
@@ -2924,14 +3045,18 @@ pick_next_task(struct rq *rq)
 	 */
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq);
-		if (likely(p))
+		if (likely(p)) {
+			update_task_ravg(p, rq, 1);
 			return p;
+		}
 	}
 
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
-		if (p)
+		if (p) {
+			update_task_ravg(p, rq, 1);
 			return p;
+		}
 	}
 
 	BUG(); /* the idle class will always have a runnable task */
@@ -7135,6 +7260,7 @@ void __init sched_init(void)
 		rq->cur_freq = 0;
 		rq->max_freq = 0;
 		rq->min_freq = 0;
+		rq->cumulative_runnable_avg = 0;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2cebc49f5e8419d59bdf5a7a130689ee5a231861..a6badc2c97236a59ab3cbe0f84bedb2414e4ee28 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -479,6 +479,7 @@ struct rq {
 #endif
 
 	int cur_freq, max_freq, min_freq;
+	u64 cumulative_runnable_avg;
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 30a5df641b275613362c98a9718cc2371d858b36..f133e28e193b32a596b7699d4f161152bd2aab32 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -289,6 +289,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "sched_ravg_window",
+		.data		= &sysctl_sched_ravg_window,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_SCHED_DEBUG
 	{
 		.procname	= "sched_min_granularity_ns",
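
A quick way to see what the history update in this patch does is to model it outside the kernel. The sketch below is illustrative only: RAVG_HIST_SIZE and the 50 ms default window mirror the patch, but struct ravg_model and model_update_history() are hypothetical stand-ins for the kernel's struct ravg and update_history(), and neither the frequency scaling in update_task_ravg() nor the cumulative_runnable_avg accounting is modelled. Each call shifts the newly concluded window sum(s) into the front of the history and recomputes demand as the maximum of the retained samples; a window with zero activity is skipped, just as the patch ignores windows in which the task slept throughout.

/*
 * Userspace model of the patch's per-window history update.
 * Names ending in _model are hypothetical; only the shift-and-max
 * logic mirrors update_history() in the diff above.
 */
#include <stdio.h>
#include <stdint.h>

#define RAVG_HIST_SIZE	5

struct ravg_model {
	uint32_t sum_history[RAVG_HIST_SIZE];	/* newest window sum at index 0 */
	uint32_t demand;			/* max over the retained history */
};

/* Shift 'samples' copies of 'runtime' into the history, track the max. */
static void model_update_history(struct ravg_model *r, uint32_t runtime,
				 int samples)
{
	uint32_t max = 0;
	int ridx, widx;

	if (!runtime)
		return;		/* windows with no activity are ignored */

	/* Age older samples towards the end of the array. */
	widx = RAVG_HIST_SIZE - 1;
	ridx = widx - samples;
	for (; ridx >= 0; --widx, --ridx) {
		r->sum_history[widx] = r->sum_history[ridx];
		if (r->sum_history[widx] > max)
			max = r->sum_history[widx];
	}

	/* Fill the vacated slots with the newly concluded window(s). */
	for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE; widx++) {
		r->sum_history[widx] = runtime;
		if (r->sum_history[widx] > max)
			max = r->sum_history[widx];
	}

	r->demand = max;	/* demand is the maximum recent window sum */
}

int main(void)
{
	struct ravg_model r = { { 0 }, 0 };
	/* Window sums (ns) for six consecutive 50 ms windows. */
	uint32_t windows[] = { 10000000, 30000000, 0, 45000000, 5000000, 20000000 };
	unsigned int i;

	for (i = 0; i < sizeof(windows) / sizeof(windows[0]); i++) {
		model_update_history(&r, windows[i], 1);
		printf("window %u: sum=%u -> demand=%u\n", i, windows[i], r.demand);
	}
	return 0;
}

Taking the maximum rather than an average biases 'demand' towards a task's recent peak, which appears to be the intent here: since the patch says 'demand' could drive frequency demand, a task that was busy in any one of the last five windows keeps that demand until the busy window ages out of the history.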