From 77dc31801de7a4f28e5880ef4883bdd96423384a Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Date: Sat, 29 Mar 2014 11:40:16 -0700
Subject: [PATCH] sched: Window-based load stat improvements

Some tasks have a sporadic load pattern: after running for short
durations, they can suddenly start running for much longer intervals.
To recognize such sharp increases in a task's demand, the task demand
is chosen as the maximum of the average of the last 5 window load
samples and the most recent sample.
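
As a rough sketch (compute_demand() is a hypothetical helper for
illustration; the actual logic lives in update_history() below):

	/* demand = max(history average, most recent window sample) */
	static u32 compute_demand(u32 *hist, u32 runtime)
	{
		u32 sum = 0;
		int i;

		for (i = 0; i < RAVG_HIST_SIZE; i++)
			sum += hist[i];

		return max(sum / RAVG_HIST_SIZE, runtime);
	}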

Make the window size (sched_ravg_window) configurable at boot
time. To prevent users from setting inappropriate values, minimum and
maximum limits are defined for the window size. Since the 'ravg'
struct tracks load for both real-time and non-real-time tasks, it is
moved out of the sched_entity struct and into task_struct.
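
For example, to use a 20ms window, pass the window size in
nanoseconds on the kernel command line:

	sched_ravg_window=20000000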

To avoid changing the function signatures of move_tasks() and
move_one_task(), per-cpu variables are defined to track the total
load moved. When multiple tasks are selected for migration in one
load-balance operation, loads > 100 could be sent through the
migration notifiers. Prevent this by capping mnd.load at 100 in such
cases.
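
The cap is applied when the notifier data is filled in, e.g. in
load_balance():

	mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
	if (mnd.load > 100)
		mnd.load = 100;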

Define wrapper functions to compute task cpu demand and to update
rq->cumulative_runnable_avg.
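
The new helpers (added to kernel/sched/sched.h and kernel/sched/fair.c
in this patch):

	inc_cumulative_runnable_avg(rq, p); /* adds p->ravg.demand */
	dec_cumulative_runnable_avg(rq, p); /* subtracts it; BUG_ON() underflow */
	pct_task_load(p);  /* task demand as a percentage of sched_ravg_window */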

Change-Id: I9abfbf3b5fe23ae615a6acd3db9580cfdeb515b4
Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Signed-off-by: Rohit Gupta <rohgup@codeaurora.org>
---
 include/linux/sched.h        |   6 +-
 include/linux/sched/sysctl.h |   1 -
 kernel/sched/core.c          | 116 ++++++++++++++++++-----------------
 kernel/sched/fair.c          |  73 ++++++++++++++++------
 kernel/sched/sched.h         |  18 +++++-
 kernel/sysctl.c              |   7 ---
 6 files changed, 132 insertions(+), 89 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 484c090e9a81..150bdd3f129c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -997,11 +997,6 @@ struct ravg {
 
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
-	/*
-	 * Todo : Move ravg to 'struct task_struct', as this is common for both
-	 * real-time and non-realtime tasks
-	 */
-	struct ravg		ravg;
 	struct rb_node		run_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
@@ -1080,6 +1075,7 @@ struct task_struct {
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+	struct ravg ravg;
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 0e341af58ed1..e93f844e469a 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -36,7 +36,6 @@ extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_wake_to_idle;
-extern unsigned int sysctl_sched_ravg_window;
 extern unsigned int sysctl_sched_wakeup_load_threshold;
 
 enum sched_tunable_scaling {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc705072e80e..82d6c9f3e9d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -784,7 +784,7 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 1);
-	rq->cumulative_runnable_avg += p->se.ravg.demand;
+	inc_cumulative_runnable_avg(rq, p);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -793,8 +793,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 0);
-	rq->cumulative_runnable_avg -= p->se.ravg.demand;
-	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+	dec_cumulative_runnable_avg(rq, p);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1346,6 +1345,15 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 		wq_worker_waking_up(p, cpu_of(rq));
 }
 
+/* Default window size (in ns) = 10ms */
+__read_mostly unsigned int sched_ravg_window = 10000000;
+
+/* Min window size (in ns) = 10ms */
+__read_mostly unsigned int min_sched_ravg_window = 10000000;
+
+/* Max window size (in ns) = 1s */
+__read_mostly unsigned int max_sched_ravg_window = 1000000000;
+
 /*
  * Called when new window is starting for a task, to record cpu usage over
  * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
@@ -1355,9 +1363,9 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 static inline void
 update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
 {
-	u32 *hist = &p->se.ravg.sum_history[0];
+	u32 *hist = &p->ravg.sum_history[0];
 	int ridx, widx;
-	u32 max = 0;
+	u32 sum = 0, avg;
 
 	/* Ignore windows where task had no activity */
 	if (!runtime)
@@ -1368,86 +1376,96 @@ update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
 	ridx = widx - samples;
 	for (; ridx >= 0; --widx, --ridx) {
 		hist[widx] = hist[ridx];
-		if  (hist[widx] > max)
-			max = hist[widx];
+		sum += hist[widx];
 	}
 
 	for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE; widx++) {
 		hist[widx] = runtime;
-		if  (hist[widx] > max)
-			max = hist[widx];
+		sum += hist[widx];
 	}
 
-	p->se.ravg.sum = 0;
+	p->ravg.sum = 0;
 	if (p->on_rq) {
-		rq->cumulative_runnable_avg -= p->se.ravg.demand;
+		rq->cumulative_runnable_avg -= p->ravg.demand;
 		BUG_ON((s64)rq->cumulative_runnable_avg < 0);
 	}
-	/*
-	 * Maximum demand seen over previous RAVG_HIST_SIZE windows drives
-	 * frequency demand for a task. Record maximum in 'demand' attribute.
-	 */
-	p->se.ravg.demand = max;
+
+	avg = sum / RAVG_HIST_SIZE;
+
+	p->ravg.demand = max(avg, runtime);
+
 	if (p->on_rq)
-		rq->cumulative_runnable_avg += p->se.ravg.demand;
+		rq->cumulative_runnable_avg += p->ravg.demand;
 }
 
-/* Window size (in ns) */
-__read_mostly unsigned int sysctl_sched_ravg_window = 50000000;
+static int __init set_sched_ravg_window(char *str)
+{
+	get_option(&str, &sched_ravg_window);
+
+	return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
 
 void update_task_ravg(struct task_struct *p, struct rq *rq, int update_sum)
 {
-	u32 window_size = sysctl_sched_ravg_window;
+	u32 window_size = sched_ravg_window;
 	int new_window;
 	u64 wallclock = sched_clock();
 
+	if (sched_ravg_window < min_sched_ravg_window)
+		return;
+
 	do {
 		s64 delta = 0;
 		int n;
 		u64 now = wallclock;
 
 		new_window = 0;
-		delta = now - p->se.ravg.window_start;
+		delta = now - p->ravg.window_start;
 		BUG_ON(delta < 0);
 		if (delta > window_size) {
-			p->se.ravg.window_start += window_size;
-			now = p->se.ravg.window_start;
+			p->ravg.window_start += window_size;
+			now = p->ravg.window_start;
 			new_window = 1;
 		}
 
 		if (update_sum) {
-			delta = now - p->se.ravg.mark_start;
+			unsigned int cur_freq = rq->cur_freq;
+
+			delta = now - p->ravg.mark_start;
 			BUG_ON(delta < 0);
 
-			if (likely(rq->cur_freq &&
-					rq->cur_freq <= max_possible_freq))
-				delta = div64_u64(delta  * rq->cur_freq,
+			if (unlikely(cur_freq > max_possible_freq))
+				cur_freq = max_possible_freq;
+
+			delta = div64_u64(delta  * cur_freq,
 							max_possible_freq);
-			p->se.ravg.sum += delta;
-			WARN_ON(p->se.ravg.sum > window_size);
+			p->ravg.sum += delta;
+			WARN_ON(p->ravg.sum > window_size);
 		}
 
 		if (!new_window)
 			break;
 
-		update_history(rq, p, p->se.ravg.sum, 1);
+		update_history(rq, p, p->ravg.sum, 1);
 
-		delta = wallclock - p->se.ravg.window_start;
+		delta = wallclock - p->ravg.window_start;
 		BUG_ON(delta < 0);
 		n = div64_u64(delta, window_size);
 		if (n) {
 			if (!update_sum)
-				p->se.ravg.window_start = wallclock;
+				p->ravg.window_start = wallclock;
 			else
-				p->se.ravg.window_start += n * window_size;
-			BUG_ON(p->se.ravg.window_start > wallclock);
+				p->ravg.window_start += n * window_size;
+			BUG_ON(p->ravg.window_start > wallclock);
 			if (update_sum)
 				update_history(rq, p, window_size, n);
 		}
-		p->se.ravg.mark_start =  p->se.ravg.window_start;
+		p->ravg.mark_start =  p->ravg.window_start;
 	} while (new_window);
 
-	p->se.ravg.mark_start = wallclock;
+	p->ravg.mark_start = wallclock;
 }
 
 /*
@@ -1664,11 +1682,8 @@ out:
 
 		mnd.src_cpu = src_cpu;
 		mnd.dest_cpu = cpu;
-		if (sysctl_sched_ravg_window)
-			mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
-				(u64)(sysctl_sched_ravg_window));
-		else
-			mnd.load = 0;
+		mnd.load = pct_task_load(p);
+
 		/*
 		 * Call the migration notifier with mnd for foreground task
 		 * migrations as well as for wakeups if their load is above
@@ -1754,8 +1769,6 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  */
 static void __sched_fork(struct task_struct *p)
 {
-	int i;
-
 	p->on_rq			= 0;
 
 	p->se.on_rq			= 0;
@@ -1764,12 +1777,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
-	p->se.ravg.sum			= 0;
-	p->se.ravg.demand		= 0;
-	p->se.ravg.window_start		= 0;
-	p->se.ravg.mark_start		= 0;
-	for (i = 0; i < RAVG_HIST_SIZE; ++i)
-		p->se.ravg.sum_history[i] = 0;
+	init_new_task_load(p);
 
 	INIT_LIST_HEAD(&p->se.group_node);
 
@@ -1914,7 +1922,6 @@ void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
-	u64 wallclock = sched_clock();
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
@@ -1928,8 +1935,6 @@ void wake_up_new_task(struct task_struct *p)
 
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	p->se.ravg.window_start	= wallclock;
-	p->se.ravg.mark_start	= wallclock;
 	p->on_rq = 1;
 	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -5062,11 +5067,7 @@ fail:
 
 		mnd.src_cpu = src_cpu;
 		mnd.dest_cpu = dest_cpu;
-		if (sysctl_sched_ravg_window)
-			mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
-				(u64)(sysctl_sched_ravg_window));
-		else
-			mnd.load = 0;
+		mnd.load = pct_task_load(p);
 		atomic_notifier_call_chain(&migration_notifier_head,
 					   0, (void *)&mnd);
 	}
@@ -7115,6 +7116,7 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
 	if (val != CPUFREQ_POSTCHANGE)
 		return 0;
 
+	BUG_ON(!new_freq);
 	cpu_rq(cpu)->cur_freq = new_freq;
 
 	return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc68ac7f95c3..a5c4603497a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1118,6 +1118,39 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline unsigned int task_load(struct task_struct *p)
+{
+	return p->ravg.demand;
+}
+
+static inline unsigned int max_task_load(void)
+{
+	return sched_ravg_window;
+}
+
+/* Return task demand in percentage scale */
+unsigned int pct_task_load(struct task_struct *p)
+{
+	unsigned int load;
+
+	load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
+
+	return load;
+}
+
+void init_new_task_load(struct task_struct *p)
+{
+	int i;
+	u64 wallclock = sched_clock();
+
+	p->ravg.sum			= 0;
+	p->ravg.demand			= 0;
+	p->ravg.window_start		= wallclock;
+	p->ravg.mark_start		= wallclock;
+	for (i = 0; i < RAVG_HIST_SIZE; ++i)
+		p->ravg.sum_history[i] = 0;
+}
+
 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 /*
@@ -3855,6 +3888,7 @@ struct lb_env {
 };
 
 static DEFINE_PER_CPU(bool, dbs_boost_needed);
+static DEFINE_PER_CPU(int, dbs_boost_load_moved);
 
 /*
  * move_task - move a task from one runqueue to another runqueue.
@@ -3984,7 +4018,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
  *
  * Called with both runqueues locked.
  */
-static int move_one_task(struct lb_env *env, int *total_run_moved)
+static int move_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
@@ -3999,9 +4033,7 @@ static int move_one_task(struct lb_env *env, int *total_run_moved)
 		 * stats here rather than inside move_task().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-		if (sysctl_sched_ravg_window)
-			*total_run_moved += div64_u64((u64)p->se.ravg.demand *
-					100, (u64)(sysctl_sched_ravg_window));
+		per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
 
 		return 1;
 	}
@@ -4019,7 +4051,7 @@ static const unsigned int sched_nr_migrate_break = 32;
  *
  * Called with both runqueues locked.
  */
-static int move_tasks(struct lb_env *env, int *total_run_moved)
+static int move_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
@@ -4058,9 +4090,7 @@ static int move_tasks(struct lb_env *env, int *total_run_moved)
 		move_task(p, env);
 		pulled++;
 		env->imbalance -= load;
-		if (sysctl_sched_ravg_window)
-			*total_run_moved += div64_u64((u64)p->se.ravg.demand *
-					100, (u64)(sysctl_sched_ravg_window));
+		per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -5033,7 +5063,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			int *balance)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
-	int total_run_moved = 0;
 	struct sched_group *group;
 	struct rq *busiest = NULL;
 	unsigned long flags;
@@ -5058,6 +5087,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpumask_copy(cpus, cpu_active_mask);
 
+	per_cpu(dbs_boost_load_moved, this_cpu) = 0;
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
@@ -5103,7 +5133,7 @@ more_balance:
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = move_tasks(&env, &total_run_moved);
+		cur_ld_moved = move_tasks(&env);
 		ld_moved += cur_ld_moved;
 		double_rq_unlock(env.dst_rq, busiest);
 		local_irq_restore(flags);
@@ -5223,13 +5253,16 @@ more_balance:
 		if (per_cpu(dbs_boost_needed, this_cpu)) {
 			struct migration_notify_data mnd;
 
-			per_cpu(dbs_boost_needed, this_cpu) = false;
-
 			mnd.src_cpu = cpu_of(busiest);
 			mnd.dest_cpu = this_cpu;
-			mnd.load = total_run_moved;
+			mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
+			if (mnd.load > 100)
+				mnd.load = 100;
 			atomic_notifier_call_chain(&migration_notifier_head,
 						   0, (void *)&mnd);
+			per_cpu(dbs_boost_needed, this_cpu) = false;
+			per_cpu(dbs_boost_load_moved, this_cpu) = 0;
+
 		}
 	}
 	if (likely(!active_balance)) {
@@ -5337,12 +5370,13 @@ static int active_load_balance_cpu_stop(void *data)
 	struct rq *busiest_rq = data;
 	int busiest_cpu = cpu_of(busiest_rq);
 	int target_cpu = busiest_rq->push_cpu;
-	int total_run_moved = 0;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
+	per_cpu(dbs_boost_load_moved, target_cpu) = 0;
+
 	/* make sure the requested cpu hasn't gone down in the meantime */
 	if (unlikely(busiest_cpu != smp_processor_id() ||
 		     !busiest_rq->active_balance))
@@ -5382,7 +5416,7 @@ static int active_load_balance_cpu_stop(void *data)
 
 		schedstat_inc(sd, alb_count);
 
-		if (move_one_task(&env, &total_run_moved))
+		if (move_one_task(&env))
 			schedstat_inc(sd, alb_pushed);
 		else
 			schedstat_inc(sd, alb_failed);
@@ -5395,13 +5429,16 @@ out_unlock:
 	if (per_cpu(dbs_boost_needed, target_cpu)) {
 		struct migration_notify_data mnd;
 
-		per_cpu(dbs_boost_needed, target_cpu) = false;
-
 		mnd.src_cpu = cpu_of(busiest_rq);
 		mnd.dest_cpu = target_cpu;
-		mnd.load = total_run_moved;
+		mnd.load = per_cpu(dbs_boost_load_moved, target_cpu);
+		if (mnd.load > 100)
+			mnd.load = 100;
 		atomic_notifier_call_chain(&migration_notifier_head,
 					   0, (void *)&mnd);
+
+		per_cpu(dbs_boost_needed, target_cpu) = false;
+		per_cpu(dbs_boost_load_moved, target_cpu) = 0;
 	}
 	return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 93e2dffe6ea0..0d58d4233501 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -12,7 +12,6 @@
 
 extern __read_mostly int scheduler_running;
 
-extern unsigned int sysctl_sched_ravg_window;
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -653,6 +652,23 @@ extern int group_balance_cpu(struct sched_group *sg);
 #include "stats.h"
 #include "auto_group.h"
 
+extern unsigned int sched_ravg_window;
+extern unsigned int pct_task_load(struct task_struct *p);
+extern void init_new_task_load(struct task_struct *p);
+
+static inline void
+inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+static inline void
+dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	rq->cumulative_runnable_avg -= p->ravg.demand;
+	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2e387e6c4067..39eba442e786 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -289,13 +289,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "sched_ravg_window",
-		.data		= &sysctl_sched_ravg_window,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "sched_wakeup_load_threshold",
 		.data		= &sysctl_sched_wakeup_load_threshold,
-- 
GitLab