diff --git a/include/linux/sched.h b/include/linux/sched.h
index 484c090e9a81b83acd32038577326e22af7aa8aa..150bdd3f129c9eacf5d5b5422df382911b89472f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -997,11 +997,6 @@ struct ravg {
 
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
-	/*
-	 * Todo : Move ravg to 'struct task_struct', as this is common for both
-	 * real-time and non-realtime tasks
-	 */
-	struct ravg ravg;
 	struct rb_node		run_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
@@ -1080,6 +1075,7 @@ struct task_struct {
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+	struct ravg ravg;
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 0e341af58ed1c94ed9c9eb8923a3343684ac8f0d..e93f844e469aa19fb2c7e8558d90c6bb04fa8534 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -36,7 +36,6 @@ extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_wake_to_idle;
-extern unsigned int sysctl_sched_ravg_window;
 extern unsigned int sysctl_sched_wakeup_load_threshold;
 
 enum sched_tunable_scaling {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc705072e80e414debcbb1c25ee953e319cc3651..82d6c9f3e9d3d3fc4d97ae116095677ed83fdbdb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -784,7 +784,7 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 1);
-	rq->cumulative_runnable_avg += p->se.ravg.demand;
+	inc_cumulative_runnable_avg(rq, p);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -793,8 +793,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 0);
-	rq->cumulative_runnable_avg -= p->se.ravg.demand;
-	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+	dec_cumulative_runnable_avg(rq, p);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1346,6 +1345,15 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 		wq_worker_waking_up(p, cpu_of(rq));
 }
 
+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = 10000000;
+
+/* Min window size (in ns) = 10ms */
+__read_mostly unsigned int min_sched_ravg_window = 10000000;
+
+/* Max window size (in ns) = 1s */
+__read_mostly unsigned int max_sched_ravg_window = 1000000000;
+
 /*
  * Called when new window is starting for a task, to record cpu usage over
  * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
@@ -1355,9 +1363,9 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 static inline void
 update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
 {
-	u32 *hist = &p->se.ravg.sum_history[0];
+	u32 *hist = &p->ravg.sum_history[0];
 	int ridx, widx;
-	u32 max = 0;
+	u32 sum = 0, avg;
 
 	/* Ignore windows where task had no activity */
 	if (!runtime)
@@ -1368,86 +1376,96 @@ update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
 	ridx = widx - samples;
 	for (; ridx >= 0; --widx, --ridx) {
 		hist[widx] = hist[ridx];
-		if (hist[widx] > max)
-			max = hist[widx];
+		sum += hist[widx];
 	}
 
 	for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE; widx++) {
 		hist[widx] = runtime;
-		if (hist[widx] > max)
-			max = hist[widx];
+		sum += hist[widx];
 	}
 
-	p->se.ravg.sum = 0;
+	p->ravg.sum = 0;
 
 	if (p->on_rq) {
-		rq->cumulative_runnable_avg -= p->se.ravg.demand;
+		rq->cumulative_runnable_avg -= p->ravg.demand;
 		BUG_ON((s64)rq->cumulative_runnable_avg < 0);
 	}
-	/*
-	 * Maximum demand seen over previous RAVG_HIST_SIZE windows drives
-	 * frequency demand for a task. Record maximum in 'demand' attribute.
-	 */
-	p->se.ravg.demand = max;
+
+	avg = sum / RAVG_HIST_SIZE;
+
+	p->ravg.demand = max(avg, runtime);
+
 	if (p->on_rq)
-		rq->cumulative_runnable_avg += p->se.ravg.demand;
+		rq->cumulative_runnable_avg += p->ravg.demand;
 }
 
-/* Window size (in ns) */
-__read_mostly unsigned int sysctl_sched_ravg_window = 50000000;
+static int __init set_sched_ravg_window(char *str)
+{
+	get_option(&str, &sched_ravg_window);
+
+	return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
 
 void update_task_ravg(struct task_struct *p, struct rq *rq, int update_sum)
 {
-	u32 window_size = sysctl_sched_ravg_window;
+	u32 window_size = sched_ravg_window;
 	int new_window;
 	u64 wallclock = sched_clock();
 
+	if (sched_ravg_window < min_sched_ravg_window)
+		return;
+
 	do {
 		s64 delta = 0;
 		int n;
 		u64 now = wallclock;
 
 		new_window = 0;
-		delta = now - p->se.ravg.window_start;
+		delta = now - p->ravg.window_start;
 		BUG_ON(delta < 0);
 		if (delta > window_size) {
-			p->se.ravg.window_start += window_size;
-			now = p->se.ravg.window_start;
+			p->ravg.window_start += window_size;
+			now = p->ravg.window_start;
 			new_window = 1;
 		}
 
 		if (update_sum) {
-			delta = now - p->se.ravg.mark_start;
+			unsigned int cur_freq = rq->cur_freq;
+
+			delta = now - p->ravg.mark_start;
 			BUG_ON(delta < 0);
 
-			if (likely(rq->cur_freq &&
-				   rq->cur_freq <= max_possible_freq))
-				delta = div64_u64(delta * rq->cur_freq,
+			if (unlikely(cur_freq > max_possible_freq))
+				cur_freq = max_possible_freq;
+
+			delta = div64_u64(delta * cur_freq,
 							max_possible_freq);
-			p->se.ravg.sum += delta;
-			WARN_ON(p->se.ravg.sum > window_size);
+			p->ravg.sum += delta;
+			WARN_ON(p->ravg.sum > window_size);
 		}
 
 		if (!new_window)
 			break;
 
-		update_history(rq, p, p->se.ravg.sum, 1);
+		update_history(rq, p, p->ravg.sum, 1);
 
-		delta = wallclock - p->se.ravg.window_start;
+		delta = wallclock - p->ravg.window_start;
 		BUG_ON(delta < 0);
 		n = div64_u64(delta, window_size);
 		if (n) {
 			if (!update_sum)
-				p->se.ravg.window_start = wallclock;
+				p->ravg.window_start = wallclock;
 			else
-				p->se.ravg.window_start += n * window_size;
-			BUG_ON(p->se.ravg.window_start > wallclock);
+				p->ravg.window_start += n * window_size;
+			BUG_ON(p->ravg.window_start > wallclock);
 			if (update_sum)
 				update_history(rq, p, window_size, n);
 		}
-		p->se.ravg.mark_start = p->se.ravg.window_start;
+		p->ravg.mark_start = p->ravg.window_start;
 	} while (new_window);
 
-	p->se.ravg.mark_start = wallclock;
+	p->ravg.mark_start = wallclock;
 }
 
 /*
@@ -1664,11 +1682,8 @@ out:
 
 	mnd.src_cpu = src_cpu;
 	mnd.dest_cpu = cpu;
-	if (sysctl_sched_ravg_window)
-		mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
-				(u64)(sysctl_sched_ravg_window));
-	else
-		mnd.load = 0;
+	mnd.load = pct_task_load(p);
+
 	/*
 	 * Call the migration notifier with mnd for foreground task
 	 * migrations as well as for wakeups if their load is above
@@ -1754,8 +1769,6 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  */
 static void __sched_fork(struct task_struct *p)
 {
-	int i;
-
 	p->on_rq = 0;
 
 	p->se.on_rq = 0;
@@ -1764,12 +1777,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
 	p->se.vruntime = 0;
-	p->se.ravg.sum = 0;
-	p->se.ravg.demand = 0;
-	p->se.ravg.window_start = 0;
-	p->se.ravg.mark_start = 0;
-	for (i = 0; i < RAVG_HIST_SIZE; ++i)
-		p->se.ravg.sum_history[i] = 0;
+	init_new_task_load(p);
 
 	INIT_LIST_HEAD(&p->se.group_node);
 
@@ -1914,7 +1922,6 @@ void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
-	u64 wallclock = sched_clock();
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
@@ -1928,8 +1935,6 @@ void wake_up_new_task(struct task_struct *p)
 
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	p->se.ravg.window_start = wallclock;
-	p->se.ravg.mark_start = wallclock;
 	p->on_rq = 1;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -5062,11 +5067,7 @@ fail:
 
 	mnd.src_cpu = src_cpu;
 	mnd.dest_cpu = dest_cpu;
-	if (sysctl_sched_ravg_window)
-		mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
-				(u64)(sysctl_sched_ravg_window));
-	else
-		mnd.load = 0;
+	mnd.load = pct_task_load(p);
 
 	atomic_notifier_call_chain(&migration_notifier_head,
 				   0, (void *)&mnd);
 }
@@ -7115,6 +7116,7 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
 	if (val != CPUFREQ_POSTCHANGE)
 		return 0;
 
+	BUG_ON(!new_freq);
 	cpu_rq(cpu)->cur_freq = new_freq;
 
 	return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc68ac7f95c3f1176f7dc129b66f29beb017c16e..a5c4603497a9beac2d73036b8448fb1ba545ab32 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1118,6 +1118,39 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline unsigned int task_load(struct task_struct *p)
+{
+	return p->ravg.demand;
+}
+
+static inline unsigned int max_task_load(void)
+{
+	return sched_ravg_window;
+}
+
+/* Return task demand in percentage scale */
+unsigned int pct_task_load(struct task_struct *p)
+{
+	unsigned int load;
+
+	load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
+
+	return load;
+}
+
+void init_new_task_load(struct task_struct *p)
+{
+	int i;
+	u64 wallclock = sched_clock();
+
+	p->ravg.sum = 0;
+	p->ravg.demand = 0;
+	p->ravg.window_start = wallclock;
+	p->ravg.mark_start = wallclock;
+	for (i = 0; i < RAVG_HIST_SIZE; ++i)
+		p->ravg.sum_history[i] = 0;
+}
+
 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 /*
@@ -3855,6 +3888,7 @@ struct lb_env {
 };
 
 static DEFINE_PER_CPU(bool, dbs_boost_needed);
+static DEFINE_PER_CPU(int, dbs_boost_load_moved);
 
 /*
  * move_task - move a task from one runqueue to another runqueue.
@@ -3984,7 +4018,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
  *
  * Called with both runqueues locked.
  */
-static int move_one_task(struct lb_env *env, int *total_run_moved)
+static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
@@ -3999,9 +4033,7 @@ static int move_one_task(struct lb_env *env, int *total_run_moved)
 		 * stats here rather than inside move_task().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-		if (sysctl_sched_ravg_window)
-			*total_run_moved += div64_u64((u64)p->se.ravg.demand *
-				100, (u64)(sysctl_sched_ravg_window));
+		per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
 
 		return 1;
 	}
@@ -4019,7 +4051,7 @@ static const unsigned int sched_nr_migrate_break = 32;
  *
  * Called with both runqueues locked.
  */
-static int move_tasks(struct lb_env *env, int *total_run_moved)
+static int move_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
@@ -4058,9 +4090,7 @@ static int move_tasks(struct lb_env *env, int *total_run_moved)
 		move_task(p, env);
 		pulled++;
 		env->imbalance -= load;
-		if (sysctl_sched_ravg_window)
-			*total_run_moved += div64_u64((u64)p->se.ravg.demand *
-				100, (u64)(sysctl_sched_ravg_window));
+		per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -5033,7 +5063,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			int *balance)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
-	int total_run_moved = 0;
 	struct sched_group *group;
 	struct rq *busiest = NULL;
 	unsigned long flags;
@@ -5058,6 +5087,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpumask_copy(cpus, cpu_active_mask);
 
+	per_cpu(dbs_boost_load_moved, this_cpu) = 0;
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
@@ -5103,7 +5133,7 @@ more_balance:
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env, &total_run_moved);
+		cur_ld_moved = move_tasks(&env);
 		ld_moved += cur_ld_moved;
 		double_rq_unlock(env.dst_rq, busiest);
 		local_irq_restore(flags);
@@ -5223,13 +5253,16 @@ more_balance:
 		if (per_cpu(dbs_boost_needed, this_cpu)) {
 			struct migration_notify_data mnd;
 
-			per_cpu(dbs_boost_needed, this_cpu) = false;
-
 			mnd.src_cpu = cpu_of(busiest);
 			mnd.dest_cpu = this_cpu;
-			mnd.load = total_run_moved;
+			mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
+			if (mnd.load > 100)
+				mnd.load = 100;
 			atomic_notifier_call_chain(&migration_notifier_head,
 						   0, (void *)&mnd);
+			per_cpu(dbs_boost_needed, this_cpu) = false;
+			per_cpu(dbs_boost_load_moved, this_cpu) = 0;
+
 		}
 	}
 	if (likely(!active_balance)) {
@@ -5337,12 +5370,13 @@ static int active_load_balance_cpu_stop(void *data)
 	struct rq *busiest_rq = data;
 	int busiest_cpu = cpu_of(busiest_rq);
 	int target_cpu = busiest_rq->push_cpu;
-	int total_run_moved = 0;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
+	per_cpu(dbs_boost_load_moved, target_cpu) = 0;
+
 	/* make sure the requested cpu hasn't gone down in the meantime */
 	if (unlikely(busiest_cpu != smp_processor_id() ||
 		     !busiest_rq->active_balance))
@@ -5382,7 +5416,7 @@ static int active_load_balance_cpu_stop(void *data)
 
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(&env, &total_run_moved))
+		if (move_one_task(&env))
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
@@ -5395,13 +5429,16 @@ out_unlock:
 	if (per_cpu(dbs_boost_needed, target_cpu)) {
 		struct migration_notify_data mnd;
 
-		per_cpu(dbs_boost_needed, target_cpu) = false;
-
 		mnd.src_cpu = cpu_of(busiest_rq);
 		mnd.dest_cpu = target_cpu;
-		mnd.load = total_run_moved;
+		mnd.load = per_cpu(dbs_boost_load_moved, target_cpu);
+		if (mnd.load > 100)
+			mnd.load = 100;
 		atomic_notifier_call_chain(&migration_notifier_head,
 					   0, (void *)&mnd);
+
+		per_cpu(dbs_boost_needed, target_cpu) = false;
+		per_cpu(dbs_boost_load_moved, target_cpu) = 0;
 	}
 	return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 93e2dffe6ea028de0be0e57e959c5d606255bfb1..0d58d4233501599be47ed19d5b987a2e5116f986 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -12,7 +12,6 @@
 
 extern __read_mostly int scheduler_running;
 
-extern unsigned int sysctl_sched_ravg_window;
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -653,6 +652,23 @@ extern int group_balance_cpu(struct sched_group *sg);
 #include "stats.h"
 #include "auto_group.h"
 
+extern unsigned int sched_ravg_window;
+extern unsigned int pct_task_load(struct task_struct *p);
+extern void init_new_task_load(struct task_struct *p);
+
+static inline void
+inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+static inline void
+dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	rq->cumulative_runnable_avg -= p->ravg.demand;
+	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2e387e6c40678a2cdc9995aaeb761c2cbe8cf6ff..39eba442e7862b738c852bf6684bb62ccd9aefad 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -289,13 +289,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "sched_ravg_window",
-		.data		= &sysctl_sched_ravg_window,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "sched_wakeup_load_threshold",
 		.data		= &sysctl_sched_wakeup_load_threshold,
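
Reviewer note, not part of the patch: a minimal user-space sketch of the two pieces of arithmetic this change touches. It assumes a 5-entry RAVG_HIST_SIZE and the new 10ms default window from core.c above; the helper names (demand_max, demand_avg, scale_runtime) are illustrative only and do not exist in the kernel. The old update_history() kept the maximum window sum in the history as 'demand'; the new one averages the history and then raises the result to the latest window's runtime if that is larger, while busy time is still normalized by cur_freq/max_possible_freq before being added to ravg.sum.

/*
 * Illustrative sketch only; hypothetical helpers, not kernel code.
 * Assumes RAVG_HIST_SIZE == 5 and a 10ms window as in the patch defaults.
 */
#include <stdio.h>
#include <stdint.h>

#define RAVG_HIST_SIZE	5

/* Old policy: demand is the maximum window sum seen in the history. */
static uint32_t demand_max(const uint32_t hist[RAVG_HIST_SIZE])
{
	uint32_t max = 0;
	for (int i = 0; i < RAVG_HIST_SIZE; i++)
		if (hist[i] > max)
			max = hist[i];
	return max;
}

/* New policy: demand is max(average of the history, latest window runtime). */
static uint32_t demand_avg(const uint32_t hist[RAVG_HIST_SIZE], uint32_t runtime)
{
	uint64_t sum = 0;
	for (int i = 0; i < RAVG_HIST_SIZE; i++)
		sum += hist[i];
	uint32_t avg = sum / RAVG_HIST_SIZE;
	return avg > runtime ? avg : runtime;
}

/*
 * Frequency scaling of a runtime delta, mirroring update_task_ravg():
 * time run at cur_freq is normalized to what it would take at max_possible_freq.
 */
static uint64_t scale_runtime(uint64_t delta_ns, unsigned int cur_freq,
			      unsigned int max_possible_freq)
{
	if (cur_freq > max_possible_freq)
		cur_freq = max_possible_freq;
	return delta_ns * cur_freq / max_possible_freq;
}

int main(void)
{
	/* One busy window followed by four mostly idle ones (values in ns). */
	uint32_t hist[RAVG_HIST_SIZE] = { 9000000, 1000000, 1000000, 1000000, 1000000 };
	uint32_t latest = 1000000;

	printf("old (max) demand: %u ns\n", demand_max(hist));          /* 9000000 */
	printf("new (avg) demand: %u ns\n", demand_avg(hist, latest));  /* 2600000 */
	printf("2ms run at 1.0GHz with a 1.5GHz max counts as %llu ns\n",
	       (unsigned long long)scale_runtime(2000000, 1000000, 1500000));
	return 0;
}

The example shows why the averaging change matters for the notifier paths above: one isolated busy window no longer pins pct_task_load() (and hence mnd.load) at the busy value for the next RAVG_HIST_SIZE windows, while a genuinely busy latest window still reports its full runtime.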