diff --color -rubN linux-5.7.6/include/linux/sched.h linux-5.7.6.cachy/include/linux/sched.h --- linux-5.7.6/include/linux/sched.h 2020-06-25 01:49:26.000000000 +1000 +++ linux-5.7.6.cachy/include/linux/sched.h 2020-07-24 17:51:45.879582847 +1000 @@ -452,9 +452,14 @@ /* For load-balancing: */ struct load_weight load; struct rb_node run_node; + + struct sched_entity* next[2]; + struct list_head group_node; unsigned int on_rq; + int quantom; + u64 exec_start; u64 sum_exec_runtime; u64 vruntime; @@ -464,16 +469,6 @@ struct sched_statistics statistics; -#ifdef CONFIG_FAIR_GROUP_SCHED - int depth; - struct sched_entity *parent; - /* rq on which this entity is (to be) queued: */ - struct cfs_rq *cfs_rq; - /* rq "owned" by this entity/group: */ - struct cfs_rq *my_q; - /* cached value of my_q->h_nr_running */ - unsigned long runnable_weight; -#endif #ifdef CONFIG_SMP /* diff --color -rubN linux-5.7.6/kernel/sched/core.c linux-5.7.6.cachy/kernel/sched/core.c --- linux-5.7.6/kernel/sched/core.c 2020-06-25 01:49:26.000000000 +1000 +++ linux-5.7.6.cachy/kernel/sched/core.c 2020-07-24 17:51:57.991504128 +1000 @@ -2672,18 +2672,14 @@ p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; - INIT_LIST_HEAD(&p->se.group_node); -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = NULL; -#endif + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif - RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); @@ -3246,31 +3242,10 @@ #ifdef CONFIG_SMP -/* rq->lock is NOT held, but preemption is disabled */ -static void __balance_callback(struct rq *rq) -{ - struct callback_head *head, *next; - void (*func)(struct rq *rq); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - head = rq->balance_callback; - rq->balance_callback = NULL; - while (head) { - func = (void (*)(struct rq *))head->func; - next = head->next; - head->next = NULL; - head = next; - - func(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); -} +///* rq->lock is NOT held, but preemption is disabled */ static inline void balance_callback(struct rq *rq) { - if (unlikely(rq->balance_callback)) - __balance_callback(rq); } #else @@ -3606,7 +3581,6 @@ #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); - trigger_load_balance(rq); #endif } @@ -6574,23 +6548,12 @@ wait_bit_init(); -#ifdef CONFIG_FAIR_GROUP_SCHED - ptr += 2 * nr_cpu_ids * sizeof(void **); -#endif #ifdef CONFIG_RT_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif if (ptr) { ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.se = (struct sched_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - -#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); @@ -6641,32 +6604,7 @@ init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - /* - * How much CPU bandwidth does root_task_group get? - * - * In case of task-groups formed thr' the cgroup filesystem, it - * gets 100% of the CPU resources in the system. 
This overall - * system CPU resource is divided among the tasks of - * root_task_group and its child task-groups in a fair manner, - * based on each entity's (task or task-group's) weight - * (se->load.weight). - * - * In other words, if root_task_group has 10 tasks of weight - * 1024) and two child groups A0 and A1 (of weight 1024 each), - * then A0's share of the CPU resource is: - * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% - * - * We achieve this by letting root_task_group's tasks sit - * directly in rq->cfs (i.e root_task_group->se[] = NULL). - */ - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); - init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); -#endif /* CONFIG_FAIR_GROUP_SCHED */ + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED diff --color -rubN linux-5.7.6/kernel/sched/debug.c linux-5.7.6.cachy/kernel/sched/debug.c --- linux-5.7.6/kernel/sched/debug.c 2020-06-25 01:49:26.000000000 +1000 +++ linux-5.7.6.cachy/kernel/sched/debug.c 2020-07-24 17:52:15.419390856 +1000 @@ -385,7 +385,7 @@ return; PN(se->exec_start); - PN(se->vruntime); + //PN(se->vruntime); PN(se->sum_exec_runtime); if (schedstat_enabled()) { @@ -437,9 +437,9 @@ else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, task_pid_nr(p), - SPLIT_NS(p->se.vruntime), + SEQ_printf(m, "%15s %5d %9d %9Ld %8d ", + p->comm, task_pid_nr(p), p->se.quantom, + //SPLIT_NS(p->se.vruntime),%9Ld.%06ld (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -464,9 +464,9 @@ SEQ_printf(m, "\n"); SEQ_printf(m, "runnable tasks:\n"); - SEQ_printf(m, " S task PID tree-key switches prio" + SEQ_printf(m, " S task PID quantom switches prio" " wait-time sum-exec sum-sleep\n"); - SEQ_printf(m, "-------------------------------------------------------" + SEQ_printf(m, "--------------------------------------------------------------------" "----------------------------------------------------\n"); rcu_read_lock(); @@ -481,10 +481,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; struct rq *rq = cpu_rq(cpu); - struct sched_entity *last; + //struct sched_entity *last; unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -498,26 +496,26 @@ SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (rb_first_cached(&cfs_rq->tasks_timeline)) - MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; - last = __pick_last_entity(cfs_rq); - if (last) - max_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; + //if (rb_first_cached(&cfs_rq->tasks_timeline)) + //MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; + //last = __pick_last_entity(cfs_rq); + //if (last) + //max_vruntime = last->vruntime; + //min_vruntime = cfs_rq->min_vruntime; + //rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; raw_spin_unlock_irqrestore(&rq->lock, flags); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", - SPLIT_NS(MIN_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", - SPLIT_NS(max_vruntime)); - spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", - SPLIT_NS(spread)); - spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", - SPLIT_NS(spread0)); + //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", 
"MIN_vruntime", + //SPLIT_NS(MIN_vruntime)); + //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + //SPLIT_NS(min_vruntime)); + //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + //SPLIT_NS(max_vruntime)); + //spread = max_vruntime - MIN_vruntime; + //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + //SPLIT_NS(spread)); + //spread0 = min_vruntime - rq0_min_vruntime; + //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + //SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); @@ -875,7 +873,7 @@ #define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F)) PN(se.exec_start); - PN(se.vruntime); + //PN(se.vruntime); PN(se.sum_exec_runtime); nr_switches = p->nvcsw + p->nivcsw; diff --color -rubN linux-5.7.6/kernel/sched/fair.c linux-5.7.6.cachy/kernel/sched/fair.c --- linux-5.7.6/kernel/sched/fair.c 2020-06-25 01:49:26.000000000 +1000 +++ linux-5.7.6.cachy/kernel/sched/fair.c 2020-07-24 17:52:09.159431543 +1000 @@ -86,6 +86,9 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#define DIR_RIGHT 0 +#define DIR_LEFT 1 + int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { @@ -259,193 +262,6 @@ * CFS operations on generic schedulable entities: */ -#ifdef CONFIG_FAIR_GROUP_SCHED -static inline struct task_struct *task_of(struct sched_entity *se) -{ - SCHED_WARN_ON(!entity_is_task(se)); - return container_of(se, struct task_struct, se); -} - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - -static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); - - if (cfs_rq->on_list) - return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; - - cfs_rq->on_list = 1; - - /* - * Ensure we either appear before our parent (if already - * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases and a special case for the root - * cfs_rq. Furthermore, it also means that we will always reset - * tmp_alone_branch either when the branch is connected - * to a tree or when we reach the top of the tree - */ - if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { - /* - * If parent is already on the list, we add the child - * just before. Thanks to circular linked property of - * the list, this means to put the child at the tail - * of the list that starts by parent. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); - /* - * The branch is now connected to its tree so we can - * reset tmp_alone_branch to the beginning of the - * list. 
- */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - return true; - } - - if (!cfs_rq->tg->parent) { - /* - * cfs rq without parent should be put - * at the tail of the list. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - /* - * We have reach the top of a tree so we can reset - * tmp_alone_branch to the beginning of the list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - return true; - } - - /* - * The parent has not already been added so we want to - * make sure that it will be put after us. - * tmp_alone_branch points to the begin of the branch - * where we will add parent. - */ - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); - /* - * update tmp_alone_branch to points to the new begin - * of the branch - */ - rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; - return false; -} - -static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->on_list) { - struct rq *rq = rq_of(cfs_rq); - - /* - * With cfs_rq being unthrottled/throttled during an enqueue, - * it can happen the tmp_alone_branch points the a leaf that - * we finally want to del. In this case, tmp_alone_branch moves - * to the prev element but it will point to rq->leaf_cfs_rq_list - * at the end of the enqueue. - */ - if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) - rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; - - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - cfs_rq->on_list = 0; - } -} - -static inline void assert_list_leaf_cfs_rq(struct rq *rq) -{ - SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ - list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ - leaf_cfs_rq_list) - -/* Do the two (enqueued) entities belong to the same group ? */ -static inline struct cfs_rq * -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - if (se->cfs_rq == pse->cfs_rq) - return se->cfs_rq; - - return NULL; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return se->parent; -} - -static void -find_matching_se(struct sched_entity **se, struct sched_entity **pse) -{ - int se_depth, pse_depth; - - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. 
- */ - - /* First walk up until both entities are at same depth */ - se_depth = (*se)->depth; - pse_depth = (*pse)->depth; - - while (se_depth > pse_depth) { - se_depth--; - *se = parent_entity(*se); - } - - while (pse_depth > se_depth) { - pse_depth--; - *pse = parent_entity(*pse); - } - - while (!is_same_group(*se, *pse)) { - *se = parent_entity(*se); - *pse = parent_entity(*pse); - } -} - -#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline struct task_struct *task_of(struct sched_entity *se) { @@ -506,138 +322,67 @@ { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); -/************************************************************** - * Scheduling class tree data structure manipulation methods: +/* + * Enqueue an entity */ - -static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 delta = (s64)(vruntime - max_vruntime); - if (delta > 0) - max_vruntime = vruntime; + se->next[DIR_RIGHT] = NULL; + se->next[DIR_LEFT] = NULL; - return max_vruntime; -} + if (likely(cfs_rq->head)) + { + se->next[DIR_RIGHT] = cfs_rq->head; + cfs_rq->head->next[DIR_LEFT] = se; -static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) -{ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta < 0) - min_vruntime = vruntime; + // lastly reset the head + cfs_rq->head = se; - return min_vruntime; -} + return; + } -static inline int entity_before(struct sched_entity *a, - struct sched_entity *b) -{ - return (s64)(a->vruntime - b->vruntime) < 0; + // if empty rq + cfs_rq->head = se; } -static void update_min_vruntime(struct cfs_rq *cfs_rq) +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct sched_entity *curr = cfs_rq->curr; - struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); - - u64 vruntime = cfs_rq->min_vruntime; - if (curr) { - if (curr->on_rq) - vruntime = curr->vruntime; - else - curr = NULL; + // if only one se in rq + if (unlikely(cfs_rq->head->next[DIR_RIGHT] == NULL)) + cfs_rq->head = NULL; + else if (unlikely(se == cfs_rq->head)) + { + // if it is the head + cfs_rq->head = cfs_rq->head->next[DIR_RIGHT]; + cfs_rq->head->next[DIR_LEFT] = NULL; } - - if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); - - if (!curr) - vruntime = se->vruntime; else - vruntime = min_vruntime(vruntime, se->vruntime); - } - - /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); -#ifndef CONFIG_64BIT - smp_wmb(); - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} + { + // if in the middle + struct sched_entity *prev = se->next[DIR_LEFT]; + struct sched_entity *next = se->next[DIR_RIGHT]; -/* - * Enqueue an entity into the rb-tree: - */ -static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; + prev->next[DIR_RIGHT] = next; - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. 
- */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; + if (next) + next->next[DIR_LEFT] = prev; } - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); -} - -static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { - struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); - - if (!left) - return NULL; - - return rb_entry(left, struct sched_entity, run_node); -} - -static struct sched_entity *__pick_next_entity(struct sched_entity *se) -{ - struct rb_node *next = rb_next(&se->run_node); - - if (!next) - return NULL; - - return rb_entry(next, struct sched_entity, run_node); + return cfs_rq->head; } #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); - - if (!last) - return NULL; - - return rb_entry(last, struct sched_entity, run_node); + return cfs_rq->head; } /************************************************************** @@ -723,16 +468,6 @@ return slice; } -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); -} - #include "pelt.h" #ifdef CONFIG_SMP @@ -856,6 +591,7 @@ return; curr->exec_start = now; + curr->quantom++; schedstat_set(curr->statistics.exec_max, max(delta_exec, curr->statistics.exec_max)); @@ -864,12 +600,10 @@ schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); - update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } @@ -2897,39 +2631,6 @@ } } -/* - * Drive the periodic memory faults.. - */ -static void task_tick_numa(struct rq *rq, struct task_struct *curr) -{ - struct callback_head *work = &curr->numa_work; - u64 period, now; - - /* - * We don't care about NUMA placement if we don't have memory. - */ - if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) - return; - - /* - * Using runtime rather than walltime has the dual advantage that - * we (mostly) drive the selection from busy threads and that the - * task needs to have done some actual work before we bother with - * NUMA placement. 
- */ - now = curr->se.sum_exec_runtime; - period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - - if (now > curr->node_stamp + period) { - if (!curr->node_stamp) - curr->numa_scan_period = task_scan_start(curr); - curr->node_stamp += period; - - if (!time_before(jiffies, curr->mm->numa_next_scan)) - task_work_add(curr, work, true); - } -} - static void update_scan_period(struct task_struct *p, int new_cpu) { int src_nid = cpu_to_node(task_cpu(p)); @@ -2965,9 +2666,6 @@ } #else -static void task_tick_numa(struct rq *rq, struct task_struct *curr) -{ -} static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) { @@ -4072,50 +3770,9 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; - - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq->nr_spread_over); #endif } -static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -{ - u64 vruntime = cfs_rq->min_vruntime; - - /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. - */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); - - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh = sysctl_sched_latency; - - /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: - */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - - vruntime -= thresh; - } - - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); -} - -static void check_enqueue_throttle(struct cfs_rq *cfs_rq); - static inline void check_schedstat_required(void) { #ifdef CONFIG_SCHEDSTATS @@ -4171,28 +3828,11 @@ static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; - /* - * If we're the current task, we must renormalise before calling - * update_curr(). - */ - if (renorm && curr) - se->vruntime += cfs_rq->min_vruntime; - update_curr(cfs_rq); /* - * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. Being - * placed in the past could significantly boost this task to the - * fairness detriment of existing tasks. - */ - if (renorm && !curr) - se->vruntime += cfs_rq->min_vruntime; - - /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - Add its load to cfs_rq->runnable_avg @@ -4205,71 +3845,12 @@ update_cfs_group(se); account_entity_enqueue(cfs_rq, se); - if (flags & ENQUEUE_WAKEUP) - place_entity(cfs_rq, se, 0); - check_schedstat_required(); update_stats_enqueue(cfs_rq, se, flags); check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - - /* - * When bandwidth control is enabled, cfs might have been removed - * because of a parent been throttled but cfs->nr_running > 1. Try to - * add it unconditionnally. 
- */ - if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) - list_add_leaf_cfs_rq(cfs_rq); - - if (cfs_rq->nr_running == 1) - check_enqueue_throttle(cfs_rq); -} - -static void __clear_buddies_last(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last != se) - break; - - cfs_rq->last = NULL; - } -} - -static void __clear_buddies_next(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next != se) - break; - - cfs_rq->next = NULL; - } -} - -static void __clear_buddies_skip(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip != se) - break; - - cfs_rq->skip = NULL; - } -} - -static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (cfs_rq->last == se) - __clear_buddies_last(se); - - if (cfs_rq->next == se) - __clear_buddies_next(se); - - if (cfs_rq->skip == se) - __clear_buddies_skip(se); } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -4295,75 +3876,15 @@ update_stats_dequeue(cfs_rq, se, flags); - clear_buddies(cfs_rq, se); - - if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; account_entity_dequeue(cfs_rq, se); - /* - * Normalize after update_curr(); which will also have moved - * min_vruntime if @se is the one holding it back. But before doing - * update_min_vruntime() again, which will discount @se's position and - * can move min_vruntime forward still more. - */ - if (!(flags & DEQUEUE_SLEEP)) - se->vruntime -= cfs_rq->min_vruntime; - /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); update_cfs_group(se); - - /* - * Now advance min_vruntime if @se was the entity holding it back, - * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be - * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- ie. we'll be penalized. - */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) - update_min_vruntime(cfs_rq); -} - -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long ideal_runtime, delta_exec; - struct sched_entity *se; - s64 delta; - - ideal_runtime = sched_slice(cfs_rq, curr); - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. - */ - if (delta_exec < sysctl_sched_min_granularity) - return; - - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; - - if (delta < 0) - return; - - if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); } static void @@ -4371,96 +3892,18 @@ { /* 'current' is not kept within the tree. */ if (se->on_rq) { - /* - * Any task has to be enqueued before it get to execute on - * a CPU. So account for the time it spent waiting on the - * runqueue. 
- */ update_stats_wait_end(cfs_rq, se); - __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); cfs_rq->curr = se; - /* - * Track our maximum slice length, if the CPU's load is at - * least twice that of our own weight (i.e. dont track it - * when there are only lesser-weight tasks around): - */ - if (schedstat_enabled() && - rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, - max((u64)schedstat_val(se->statistics.slice_max), - se->sum_exec_runtime - se->prev_sum_exec_runtime)); - } - se->prev_sum_exec_runtime = se->sum_exec_runtime; } static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - -/* - * Pick the next process, keeping these things in mind, in this order: - * 1) keep things fair between processes/task groups - * 2) pick the "next" process, since someone really wants that to run - * 3) pick the "last" process, for cache locality - * 4) do not run the "skip" process, if something else is available - */ -static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - struct sched_entity *left = __pick_first_entity(cfs_rq); - struct sched_entity *se; - - /* - * If curr is set we have to see if its left of the leftmost entity - * still in the tree, provided there was anything in the tree at all. - */ - if (!left || (curr && entity_before(curr, left))) - left = curr; - - se = left; /* ideally we run the leftmost entity */ - - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. - */ - if (cfs_rq->skip == se) { - struct sched_entity *second; - - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) - second = curr; - } - - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } - - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; - - clear_buddies(cfs_rq, se); - - return se; -} - -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -4471,21 +3914,19 @@ if (prev->on_rq) update_curr(cfs_rq); - /* throttle cfs_rqs exceeding runtime */ - check_cfs_rq_runtime(cfs_rq); - - check_spread(cfs_rq, prev); - if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); - /* Put 'current' back into the tree. 
*/ - __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ update_load_avg(cfs_rq, prev, 0); } cfs_rq->curr = NULL; } +static int check_preempt_curr_fair(struct sched_entity *curr) +{ + return 1; +} + static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { @@ -4509,6 +3950,12 @@ resched_curr(rq_of(cfs_rq)); return; } + + if (check_preempt_curr_fair(curr) == 1) { + resched_curr(rq_of(cfs_rq)); + return; + } + /* * don't let the period tick interfere with the hrtick preemption */ @@ -4516,9 +3963,6 @@ hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif - - if (cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); } @@ -5082,30 +4526,6 @@ raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } -/* - * When a group wakes up we want to make sure that its quota is not already - * expired/exceeded, otherwise it may be allowed to steal additional ticks of - * runtime as update_curr() throttling can not not trigger until it's on-rq. - */ -static void check_enqueue_throttle(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return; - - /* an active group must be handled by the update_curr()->put() path */ - if (!cfs_rq->runtime_enabled || cfs_rq->curr) - return; - - /* ensure the group is not already throttled */ - if (cfs_rq_throttled(cfs_rq)) - return; - - /* update runtime allocation */ - account_cfs_rq_runtime(cfs_rq, 0); - if (cfs_rq->runtime_remaining <= 0) - throttle_cfs_rq(cfs_rq); -} - static void sync_throttle(struct task_group *tg, int cpu) { struct cfs_rq *pcfs_rq, *cfs_rq; @@ -5123,26 +4543,6 @@ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); } -/* conditionally throttle active cfs_rq's from put_prev_entity() */ -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - if (!cfs_bandwidth_used()) - return false; - - if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return false; - - /* - * it's possible for a throttled entity to be forced into a running - * state (e.g. set_curr_task), in this case we're finished. - */ - if (cfs_rq_throttled(cfs_rq)) - return true; - - throttle_cfs_rq(cfs_rq); - return true; -} - static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = @@ -5318,8 +4718,6 @@ } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } -static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -5548,8 +4946,6 @@ hrtick_update(rq); } -static void set_next_buddy(struct sched_entity *se); - /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and @@ -5578,12 +4974,6 @@ if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); - /* - * Bias pick_next to pick a task from this cfs_rq, as - * p is sleeping when it is within its sched_slice. - */ - if (task_sleep && se && !throttled_hierarchy(cfs_rq)) - set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; @@ -5699,53 +5089,6 @@ return cpu_rq(cpu)->cpu_capacity; } -static void record_wakee(struct task_struct *p) -{ - /* - * Only decay a single time; tasks that have less then 1 wakeup per - * jiffy will not have built up many flips. 
- */ - if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { - current->wakee_flips >>= 1; - current->wakee_flip_decay_ts = jiffies; - } - - if (current->last_wakee != p) { - current->last_wakee = p; - current->wakee_flips++; - } -} - -/* - * Detect M:N waker/wakee relationships via a switching-frequency heuristic. - * - * A waker of many should wake a different task than the one last awakened - * at a frequency roughly N times higher than one of its wakees. - * - * In order to determine whether we should let the load spread vs consolidating - * to shared cache, we look for a minimum 'flip' frequency of llc_size in one - * partner, and a factor of lls_size higher frequency in the other. - * - * With both conditions met, we can be relatively sure that the relationship is - * non-monogamous, with partner count exceeding socket size. - * - * Waker/wakee being client/server, worker/dispatcher, interrupt source or - * whatever is irrelevant, spread criteria is apparent partner count exceeds - * socket size. - */ -static int wake_wide(struct task_struct *p) -{ - unsigned int master = current->wakee_flips; - unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); - - if (master < slave) - swap(master, slave); - if (slave < factor || master < slave * factor) - return 0; - return 1; -} - /* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. For the purpose of speed we only consider the waking and previous @@ -6402,238 +5745,6 @@ return min_t(unsigned long, util, capacity_orig_of(cpu)); } -/* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. - */ -static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) -{ - struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); - - /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. - */ - if (task_cpu(p) == cpu && dst_cpu != cpu) - sub_positive(&util, task_util(p)); - else if (task_cpu(p) != cpu && dst_cpu == cpu) - util += task_util(p); - - if (sched_feat(UTIL_EST)) { - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); - - /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util() after the task has been enqueued. - */ - if (dst_cpu == cpu) - util_est += _task_util_est(p); - - util = max(util, util_est); - } - - return min(util, capacity_orig_of(cpu)); -} - -/* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of @pd's CPUs after the task migration, and uses the Energy Model - * to compute what would be the energy if we decided to actually migrate that - * task. - */ -static long -compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) -{ - struct cpumask *pd_mask = perf_domain_span(pd); - unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); - unsigned long max_util = 0, sum_util = 0; - int cpu; - - /* - * The capacity state of CPUs of the current rd can be driven by CPUs - * of another rd if they belong to the same pd. 
So, account for the - * utilization of these CPUs too by masking pd with cpu_online_mask - * instead of the rd span. - * - * If an entire pd is outside of the current rd, it will not appear in - * its pd list and will not be accounted by compute_energy(). - */ - for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { - unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); - struct task_struct *tsk = cpu == dst_cpu ? p : NULL; - - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, - ENERGY_UTIL, NULL); - - /* - * Performance domain frequency: utilization clamping - * must be considered since it affects the selection - * of the performance domain frequency. - * NOTE: in case RT tasks are running, by default the - * FREQUENCY_UTIL's utilization can be max OPP. - */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, - FREQUENCY_UTIL, tsk); - max_util = max(max_util, cpu_util); - } - - return em_pd_energy(pd->em_pd, max_util, sum_util); -} - -/* - * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the - * waking task. find_energy_efficient_cpu() looks for the CPU with maximum - * spare capacity in each performance domain and uses it as a potential - * candidate to execute the task. Then, it uses the Energy Model to figure - * out which of the CPU candidates is the most energy-efficient. - * - * The rationale for this heuristic is as follows. In a performance domain, - * all the most energy efficient CPU candidates (according to the Energy - * Model) are those for which we'll request a low frequency. When there are - * several CPUs for which the frequency request will be the same, we don't - * have enough data to break the tie between them, because the Energy Model - * only includes active power costs. With this model, if we assume that - * frequency requests follow utilization (e.g. using schedutil), the CPU with - * the maximum spare capacity in a performance domain is guaranteed to be among - * the best candidates of the performance domain. - * - * In practice, it could be preferable from an energy standpoint to pack - * small tasks on a CPU in order to let other CPUs go in deeper idle states, - * but that could also hurt our chances to go cluster idle, and we have no - * ways to tell with the current Energy Model if this is actually a good - * idea or not. So, find_energy_efficient_cpu() basically favors - * cluster-packing, and spreading inside a cluster. That should at least be - * a good thing for latency, and this is consistent with the idea that most - * of the energy savings of EAS come from the asymmetry of the system, and - * not so much from breaking the tie between identical CPUs. That's also the - * reason why EAS is enabled in the topology code only for systems where - * SD_ASYM_CPUCAPACITY is set. - * - * NOTE: Forkees are not accepted in the energy-aware wake-up path because - * they don't have any useful utilization data yet and it's not possible to - * forecast their impact on energy consumption. Consequently, they will be - * placed by find_idlest_cpu() on the least loaded CPU, which might turn out - * to be energy-inefficient in some use-cases. 
The alternative would be to - * bias new tasks towards specific types of CPUs first, or to try to infer - * their util_avg from the parent task, but those heuristics could hurt - * other use-cases too. So, until someone finds a better way to solve this, - * let's keep things simple by re-using the existing slow path. - */ -static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) -{ - unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - unsigned long cpu_cap, util, base_energy = 0; - int cpu, best_energy_cpu = prev_cpu; - struct sched_domain *sd; - struct perf_domain *pd; - - rcu_read_lock(); - pd = rcu_dereference(rd->pd); - if (!pd || READ_ONCE(rd->overutilized)) - goto fail; - - /* - * Energy-aware wake-up happens on the lowest sched_domain starting - * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. - */ - sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); - while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) - sd = sd->parent; - if (!sd) - goto fail; - - sync_entity_load_avg(&p->se); - if (!task_util_est(p)) - goto unlock; - - for (; pd; pd = pd->next) { - unsigned long cur_delta, spare_cap, max_spare_cap = 0; - unsigned long base_energy_pd; - int max_spare_cap_cpu = -1; - - /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, pd); - base_energy += base_energy_pd; - - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; - - util = cpu_util_next(cpu, p, cpu); - cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; - - /* - * Skip CPUs that cannot satisfy the capacity request. - * IOW, placing the task there would make the CPU - * overutilized. Take uclamp into account to see how - * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). - */ - util = uclamp_rq_util_with(cpu_rq(cpu), util, p); - if (!fits_capacity(util, cpu_cap)) - continue; - - /* Always use prev_cpu as a candidate. */ - if (cpu == prev_cpu) { - prev_delta = compute_energy(p, prev_cpu, pd); - prev_delta -= base_energy_pd; - best_delta = min(best_delta, prev_delta); - } - - /* - * Find the CPU with the maximum spare capacity in - * the performance domain - */ - if (spare_cap > max_spare_cap) { - max_spare_cap = spare_cap; - max_spare_cap_cpu = cpu; - } - } - - /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { - cur_delta = compute_energy(p, max_spare_cap_cpu, pd); - cur_delta -= base_energy_pd; - if (cur_delta < best_delta) { - best_delta = cur_delta; - best_energy_cpu = max_spare_cap_cpu; - } - } - } -unlock: - rcu_read_unlock(); - - /* - * Pick the best CPU if prev_cpu cannot be used, or if it saves at - * least 6% of the energy used by prev_cpu. 
- */ - if (prev_delta == ULONG_MAX) - return best_energy_cpu; - - if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) - return best_energy_cpu; - - return prev_cpu; - -fail: - rcu_read_unlock(); - - return -1; -} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -6656,19 +5767,6 @@ int want_affine = 0; int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); - if (sd_flag & SD_BALANCE_WAKE) { - record_wakee(p); - - if (sched_energy_enabled()) { - new_cpu = find_energy_efficient_cpu(p, prev_cpu); - if (new_cpu >= 0) - return new_cpu; - new_cpu = prev_cpu; - } - - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); - } - rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) @@ -6696,7 +5794,9 @@ if (unlikely(sd)) { /* Slow path */ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + } + + else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ /* Fast path */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); @@ -6718,59 +5818,6 @@ */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { - /* - * As blocked tasks retain absolute vruntime the migration needs to - * deal with this by subtracting the old and adding the new - * min_vruntime -- the latter is done by enqueue_entity() when placing - * the task on the new runqueue. - */ - if (p->state == TASK_WAKING) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 min_vruntime; - -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; - - do { - min_vruntime_copy = cfs_rq->min_vruntime_copy; - smp_rmb(); - min_vruntime = cfs_rq->min_vruntime; - } while (min_vruntime != min_vruntime_copy); -#else - min_vruntime = cfs_rq->min_vruntime; -#endif - - se->vruntime -= min_vruntime; - } - - if (p->on_rq == TASK_ON_RQ_MIGRATING) { - /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_held(&task_rq(p)->lock); - detach_entity_cfs_rq(&p->se); - - } else { - /* - * We are supposed to update the task to "current" time, then - * its up to date and ready to go to new CPU/cfs_rq. But we - * have difficulty in getting what current time is, so simply - * throw away the out-of-date time. This will result in the - * wakee task is less decayed, but giving the wakee more load - * sounds not bad. - */ - remove_entity_load_avg(&p->se); - } - - /* Tell new CPU we are migrated */ - p->se.avg.last_update_time = 0; - - /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; - - update_scan_period(p, new_cpu); } static void task_dead_fair(struct task_struct *p) @@ -6781,32 +5828,10 @@ static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - if (rq->nr_running) return 1; - - return newidle_balance(rq, rf) != 0; } #endif /* CONFIG_SMP */ -static unsigned long wakeup_gran(struct sched_entity *se) -{ - unsigned long gran = sysctl_sched_wakeup_granularity; - - /* - * Since its curr running now, convert the gran from real-time - * to virtual-time in his units. - * - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. 
- * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - return calc_delta_fair(gran, se); -} /* * Should 'se' preempt 'curr'. @@ -6817,54 +5842,43 @@ * g * |<--->|c * - * w(c, s1) = -1 + * w(c, s1) = -1 // don't preempt * w(c, s2) = 0 - * w(c, s3) = 1 + * w(c, s3) = 1 // preempt * */ static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff <= 0) - return -1; - - gran = wakeup_gran(se); - if (vdiff > gran) - return 1; - - return 0; -} - -static void set_last_buddy(struct sched_entity *se) +wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se) { - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; + u64 r_curr, r_se, w_curr, w_se; + struct task_struct *t_curr = task_of(curr); + struct task_struct *t_se = task_of(se); + u64 vr_curr = curr->sum_exec_runtime + 1; + u64 vr_se = se->sum_exec_runtime + 1; + s64 diff; + + w_curr = (now - t_curr->start_boottime) - vr_curr; + w_se = (now - t_se->start_boottime) - vr_se; + + w_curr *= (140 - t_curr->prio); + w_se *= (140 - t_se->prio); + + r_curr = w_curr / vr_curr; + r_se = w_se / vr_se; + diff = (s64)(r_se) - (s64)(r_curr); - for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) - return; - cfs_rq_of(se)->last = se; + if (diff == 0) + { + r_curr = w_curr % vr_curr; + r_se = w_se % vr_se; + diff = (s64)(r_se) - (s64)(r_curr); } -} -static void set_next_buddy(struct sched_entity *se) -{ - if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) - return; - for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) - return; - cfs_rq_of(se)->next = se; - } -} + if (diff > 0) + return 1; -static void set_skip_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; + return -1; } /* @@ -6874,28 +5888,12 @@ { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int scale = cfs_rq->nr_running >= sched_nr_latency; - int next_buddy_marked = 0; + u64 now = rq_clock_task(rq); if (unlikely(se == pse)) return; /* - * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_prempt_curr() after an enqueue (which may have - * lead to a throttle). This both saves work and prevents false - * next-buddy nomination below. - */ - if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) - return; - - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { - set_next_buddy(pse); - next_buddy_marked = 1; - } - - /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. * @@ -6923,13 +5921,7 @@ find_matching_se(&se, &pse); update_curr(cfs_rq_of(se)); BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. 
- */ - if (!next_buddy_marked) - set_next_buddy(pse); + if (wakeup_preempt_entity(now, se, pse) == 1) { goto preempt; } @@ -6948,113 +5940,36 @@ */ if (unlikely(!se->on_rq || curr == rq->idle)) return; - - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); } struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; - struct sched_entity *se; + struct sched_entity *se, *next; struct task_struct *p; - int new_tasks; + u64 now = rq_clock_task(rq); -again: - if (!sched_fair_runnable(rq)) + if (unlikely(!sched_fair_runnable(rq))) goto idle; -#ifdef CONFIG_FAIR_GROUP_SCHED - if (!prev || prev->sched_class != &fair_sched_class) - goto simple; - - /* - * Because of the set_next_buddy() in dequeue_task_fair() it is rather - * likely that a next task is from the same cgroup as the current. - * - * Therefore attempt to avoid putting and setting the entire cgroup - * hierarchy, only change the part that actually changes. - */ + se = next = cfs_rq->head; + next = next->next[DIR_RIGHT]; - do { - struct sched_entity *curr = cfs_rq->curr; - - /* - * Since we got here without doing put_prev_entity() we also - * have to consider cfs_rq->curr. If it is still a runnable - * entity, update_curr() will update its vruntime, otherwise - * forget we've ever seen it. - */ - if (curr) { - if (curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; - - /* - * This call to check_cfs_rq_runtime() will do the - * throttle and dequeue its entity in the parent(s). - * Therefore the nr_running test will indeed - * be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) { - cfs_rq = &rq->cfs; - - if (!cfs_rq->nr_running) - goto idle; - - goto simple; - } - } - - se = pick_next_entity(cfs_rq, curr); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); - - p = task_of(se); - - /* - * Since we haven't yet done put_prev_entity and if the selected task - * is a different task than we started out with, try and touch the - * least amount of cfs_rqs. - */ - if (prev != p) { - struct sched_entity *pse = &prev->se; - - while (!(cfs_rq = is_same_group(se, pse))) { - int se_depth = se->depth; - int pse_depth = pse->depth; - - if (se_depth <= pse_depth) { - put_prev_entity(cfs_rq_of(pse), pse); - pse = parent_entity(pse); - } - if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se); - se = parent_entity(se); - } - } + while (next) + { + if (wakeup_preempt_entity(now, se, next) == 1) + se = next; - put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se); + next = next->next[DIR_RIGHT]; } - goto done; -simple: -#endif - if (prev) - put_prev_task(rq, prev); - - do { - se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); - cfs_rq = group_cfs_rq(se); - } while (cfs_rq); p = task_of(se); -done: __maybe_unused; + se->quantom = 0; + #ifdef CONFIG_SMP /* * Move the next running task to the front of @@ -7075,19 +5990,6 @@ if (!rf) return NULL; - new_tasks = newidle_balance(rq, rf); - - /* - * Because newidle_balance() releases (and re-acquires) rq->lock, it is - * possible for any higher priority task to appear. In that case we - * must re-start the pick_next_entity() loop. 
- */ - if (new_tasks < 0) - return RETRY_TASK; - - if (new_tasks > 0) - goto again; - /* * rq is about to be idle, check if we need to update the * lost_idle_time of clock_pelt @@ -7125,7 +6027,6 @@ { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *se = &curr->se; /* * Are we the only task in the tree? @@ -7133,8 +6034,6 @@ if (unlikely(rq->nr_running == 1)) return; - clear_buddies(cfs_rq, se); - if (curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* @@ -7148,8 +6047,6 @@ */ rq_clock_skip_update(rq); } - - set_skip_buddy(se); } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) @@ -7160,9 +6057,6 @@ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; - /* Tell the scheduler that we'd really like pse to run next. */ - set_next_buddy(se); - yield_task_fair(rq); return true; @@ -7370,39 +6264,6 @@ struct list_head tasks; }; -/* - * Is this task likely cache-hot: - */ -static int task_hot(struct task_struct *p, struct lb_env *env) -{ - s64 delta; - - lockdep_assert_held(&env->src_rq->lock); - - if (p->sched_class != &fair_sched_class) - return 0; - - if (unlikely(task_has_idle_policy(p))) - return 0; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) - return 1; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = rq_clock_task(env->src_rq) - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - #ifdef CONFIG_NUMA_BALANCING /* * Returns 1, if task migration degrades locality @@ -7463,302 +6324,10 @@ } #endif -/* - * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? - */ -static -int can_migrate_task(struct task_struct *p, struct lb_env *env) -{ - int tsk_cache_hot; - - lockdep_assert_held(&env->src_rq->lock); - - /* - * We do not migrate tasks that are: - * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_ptr, or - * 3) running (obviously), or - * 4) are cache-hot on their current CPU. - */ - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) - return 0; - - if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { - int cpu; - - schedstat_inc(p->se.statistics.nr_failed_migrations_affine); - - env->flags |= LBF_SOME_PINNED; - - /* - * Remember if this task can be migrated to any other CPU in - * our sched_group. We may want to revisit it if we couldn't - * meet load balance goals by pulling other tasks on src_cpu. - * - * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have - * already computed one in current iteration. - */ - if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) - return 0; - - /* Prevent to re-select dst_cpu via env's CPUs: */ - for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) { - env->flags |= LBF_DST_PINNED; - env->new_dst_cpu = cpu; - break; - } - } - - return 0; - } - - /* Record that we found atleast one task that could run on dst_cpu */ - env->flags &= ~LBF_ALL_PINNED; - - if (task_running(env->src_rq, p)) { - schedstat_inc(p->se.statistics.nr_failed_migrations_running); - return 0; - } - - /* - * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. 
- */ - tsk_cache_hot = migrate_degrades_locality(p, env); - if (tsk_cache_hot == -1) - tsk_cache_hot = task_hot(p, env); - - if (tsk_cache_hot <= 0 || - env->sd->nr_balance_failed > env->sd->cache_nice_tries) { - if (tsk_cache_hot == 1) { - schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->se.statistics.nr_forced_migrations); - } - return 1; - } - - schedstat_inc(p->se.statistics.nr_failed_migrations_hot); - return 0; -} - -/* - * detach_task() -- detach the task for the migration specified in env - */ -static void detach_task(struct task_struct *p, struct lb_env *env) -{ - lockdep_assert_held(&env->src_rq->lock); - - deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); - set_task_cpu(p, env->dst_cpu); -} -/* - * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as - * part of active balancing operations within "domain". - * - * Returns a task if successful and NULL otherwise. - */ -static struct task_struct *detach_one_task(struct lb_env *env) -{ - struct task_struct *p; - - lockdep_assert_held(&env->src_rq->lock); - - list_for_each_entry_reverse(p, - &env->src_rq->cfs_tasks, se.group_node) { - if (!can_migrate_task(p, env)) - continue; - - detach_task(p, env); - - /* - * Right now, this is only the second place where - * lb_gained[env->idle] is updated (other is detach_tasks) - * so we can safely collect stats here rather than - * inside detach_tasks(). - */ - schedstat_inc(env->sd->lb_gained[env->idle]); - return p; - } - return NULL; -} static const unsigned int sched_nr_migrate_break = 32; -/* - * detach_tasks() -- tries to detach up to imbalance load/util/tasks from - * busiest_rq, as part of a balancing operation within domain "sd". - * - * Returns number of detached tasks if successful and 0 otherwise. - */ -static int detach_tasks(struct lb_env *env) -{ - struct list_head *tasks = &env->src_rq->cfs_tasks; - unsigned long util, load; - struct task_struct *p; - int detached = 0; - - lockdep_assert_held(&env->src_rq->lock); - - if (env->imbalance <= 0) - return 0; - - while (!list_empty(tasks)) { - /* - * We don't want to steal all, otherwise we may be treated likewise, - * which could at worst lead to a livelock crash. - */ - if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) - break; - - p = list_last_entry(tasks, struct task_struct, se.group_node); - - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) - break; - - /* take a breather every nr_migrate tasks */ - if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; - env->flags |= LBF_NEED_BREAK; - break; - } - - if (!can_migrate_task(p, env)) - goto next; - - switch (env->migration_type) { - case migrate_load: - load = task_h_load(p); - - if (sched_feat(LB_MIN) && - load < 16 && !env->sd->nr_balance_failed) - goto next; - - /* - * Make sure that we don't migrate too much load. - * Nevertheless, let relax the constraint if - * scheduler fails to find a good waiting task to - * migrate. 
- */ - if (load/2 > env->imbalance && - env->sd->nr_balance_failed <= env->sd->cache_nice_tries) - goto next; - - env->imbalance -= load; - break; - - case migrate_util: - util = task_util_est(p); - - if (util > env->imbalance) - goto next; - - env->imbalance -= util; - break; - - case migrate_task: - env->imbalance--; - break; - - case migrate_misfit: - /* This is not a misfit task */ - if (task_fits_capacity(p, capacity_of(env->src_cpu))) - goto next; - - env->imbalance = 0; - break; - } - - detach_task(p, env); - list_add(&p->se.group_node, &env->tasks); - - detached++; - -#ifdef CONFIG_PREEMPTION - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is detached to minimize - * the critical section. - */ - if (env->idle == CPU_NEWLY_IDLE) - break; -#endif - - /* - * We only want to steal up to the prescribed amount of - * load/util/tasks. - */ - if (env->imbalance <= 0) - break; - - continue; -next: - list_move(&p->se.group_node, tasks); - } - - /* - * Right now, this is one of only two places we collect this stat - * so we can safely collect detach_one_task() stats here rather - * than inside detach_one_task(). - */ - schedstat_add(env->sd->lb_gained[env->idle], detached); - - return detached; -} - -/* - * attach_task() -- attach the task detached by detach_task() to its new rq. - */ -static void attach_task(struct rq *rq, struct task_struct *p) -{ - lockdep_assert_held(&rq->lock); - - BUG_ON(task_rq(p) != rq); - activate_task(rq, p, ENQUEUE_NOCLOCK); - check_preempt_curr(rq, p, 0); -} - -/* - * attach_one_task() -- attaches the task returned from detach_one_task() to - * its new rq. - */ -static void attach_one_task(struct rq *rq, struct task_struct *p) -{ - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - attach_task(rq, p); - rq_unlock(rq, &rf); -} - -/* - * attach_tasks() -- attaches all tasks detached by detach_tasks() to their - * new rq. - */ -static void attach_tasks(struct lb_env *env) -{ - struct list_head *tasks = &env->tasks; - struct task_struct *p; - struct rq_flags rf; - - rq_lock(env->dst_rq, &rf); - update_rq_clock(env->dst_rq); - - while (!list_empty(tasks)) { - p = list_first_entry(tasks, struct task_struct, se.group_node); - list_del_init(&p->se.group_node); - - attach_task(env->dst_rq, p); - } - - rq_unlock(env->dst_rq, &rf); -} #ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) @@ -9086,293 +7655,6 @@ ) / SCHED_CAPACITY_SCALE; } -/******* find_busiest_group() helpers end here *********************/ - -/* - * Decision matrix according to the local and busiest group type: - * - * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded - * has_spare nr_idle balanced N/A N/A balanced balanced - * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force - * asym_packing force force N/A N/A force force - * imbalanced force force N/A N/A force force - * overloaded force force N/A N/A force avg_load - * - * N/A : Not Applicable because already filtered while updating - * statistics. - * balanced : The system is balanced for these 2 groups. - * force : Calculate the imbalance as load migration is probably needed. - * avg_load : Only if imbalance is significant enough. - * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite - * different in groups. - */ - -/** - * find_busiest_group - Returns the busiest group within the sched_domain - * if there is an imbalance. 
- * - * Also calculates the amount of runnable load which should be moved - * to restore balance. - * - * @env: The load balancing environment. - * - * Return: - The busiest group if imbalance exists. - */ -static struct sched_group *find_busiest_group(struct lb_env *env) -{ - struct sg_lb_stats *local, *busiest; - struct sd_lb_stats sds; - - init_sd_lb_stats(&sds); - - /* - * Compute the various statistics relevant for load balancing at - * this level. - */ - update_sd_lb_stats(env, &sds); - - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } - - local = &sds.local_stat; - busiest = &sds.busiest_stat; - - /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest) - goto out_balanced; - - /* Misfit tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) - goto force_balance; - - /* ASYM feature bypasses nice load balance check */ - if (busiest->group_type == group_asym_packing) - goto force_balance; - - /* - * If the busiest group is imbalanced the below checks don't - * work because they assume all things are equal, which typically - * isn't true due to cpus_ptr constraints and the like. - */ - if (busiest->group_type == group_imbalanced) - goto force_balance; - - /* - * If the local group is busier than the selected busiest group - * don't try and pull any tasks. - */ - if (local->group_type > busiest->group_type) - goto out_balanced; - - /* - * When groups are overloaded, use the avg_load to ensure fairness - * between tasks. - */ - if (local->group_type == group_overloaded) { - /* - * If the local group is more loaded than the selected - * busiest group don't try to pull any tasks. - */ - if (local->avg_load >= busiest->avg_load) - goto out_balanced; - - /* XXX broken for overlapping NUMA groups */ - sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / - sds.total_capacity; - - /* - * Don't pull any tasks if this group is already above the - * domain average load. - */ - if (local->avg_load >= sds.avg_load) - goto out_balanced; - - /* - * If the busiest group is more loaded, use imbalance_pct to be - * conservative. - */ - if (100 * busiest->avg_load <= - env->sd->imbalance_pct * local->avg_load) - goto out_balanced; - } - - /* Try to move all excess tasks to child's sibling domain */ - if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_nr_running > local->sum_nr_running + 1) - goto force_balance; - - if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) - /* - * If the busiest group is not overloaded (and as a - * result the local one too) but this CPU is already - * busy, let another idle CPU try to pull task. - */ - goto out_balanced; - - if (busiest->group_weight > 1 && - local->idle_cpus <= (busiest->idle_cpus + 1)) - /* - * If the busiest group is not overloaded - * and there is no imbalance between this and busiest - * group wrt idle CPUs, it is balanced. The imbalance - * becomes significant if the diff is greater than 1 - * otherwise we might end up to just move the imbalance - * on another group. Of course this applies only if - * there is more than 1 CPU per group. - */ - goto out_balanced; - - if (busiest->sum_h_nr_running == 1) - /* - * busiest doesn't have any tasks waiting to run - */ - goto out_balanced; - } - -force_balance: - /* Looks like there is an imbalance. 
Compute it */ - calculate_imbalance(env, &sds); - return env->imbalance ? sds.busiest : NULL; - -out_balanced: - env->imbalance = 0; - return NULL; -} - -/* - * find_busiest_queue - find the busiest runqueue among the CPUs in the group. - */ -static struct rq *find_busiest_queue(struct lb_env *env, - struct sched_group *group) -{ - struct rq *busiest = NULL, *rq; - unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; - unsigned int busiest_nr = 0; - int i; - - for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, load, util; - unsigned int nr_running; - enum fbq_type rt; - - rq = cpu_rq(i); - rt = fbq_classify_rq(rq); - - /* - * We classify groups/runqueues into three groups: - * - regular: there are !numa tasks - * - remote: there are numa tasks that run on the 'wrong' node - * - all: there is no distinction - * - * In order to avoid migrating ideally placed numa tasks, - * ignore those when there's better options. - * - * If we ignore the actual busiest queue to migrate another - * task, the next balance pass can still reduce the busiest - * queue by moving tasks around inside the node. - * - * If we cannot move enough load due to this classification - * the next pass will adjust the group classification and - * allow migration of more tasks. - * - * Both cases only affect the total convergence complexity. - */ - if (rt > env->fbq_type) - continue; - - capacity = capacity_of(i); - nr_running = rq->cfs.h_nr_running; - - /* - * For ASYM_CPUCAPACITY domains, don't pick a CPU that could - * eventually lead to active_balancing high->low capacity. - * Higher per-CPU capacity is considered better than balancing - * average load. - */ - if (env->sd->flags & SD_ASYM_CPUCAPACITY && - capacity_of(env->dst_cpu) < capacity && - nr_running == 1) - continue; - - switch (env->migration_type) { - case migrate_load: - /* - * When comparing with load imbalance, use cpu_load() - * which is not scaled with the CPU capacity. - */ - load = cpu_load(rq); - - if (nr_running == 1 && load > env->imbalance && - !check_cpu_capacity(rq, env->sd)) - break; - - /* - * For the load comparisons with the other CPUs, - * consider the cpu_load() scaled with the CPU - * capacity, so that the load can be moved away - * from the CPU that is potentially running at a - * lower capacity. - * - * Thus we're looking for max(load_i / capacity_i), - * crosswise multiplication to rid ourselves of the - * division works out to: - * load_i * capacity_j > load_j * capacity_i; - * where j is our previous maximum. - */ - if (load * busiest_capacity > busiest_load * capacity) { - busiest_load = load; - busiest_capacity = capacity; - busiest = rq; - } - break; - - case migrate_util: - util = cpu_util(cpu_of(rq)); - - /* - * Don't try to pull utilization from a CPU with one - * running task. Whatever its utilization, we will fail - * detach the task. - */ - if (nr_running <= 1) - continue; - - if (busiest_util < util) { - busiest_util = util; - busiest = rq; - } - break; - - case migrate_task: - if (busiest_nr < nr_running) { - busiest_nr = nr_running; - busiest = rq; - } - break; - - case migrate_misfit: - /* - * For ASYM_CPUCAPACITY domains with misfit tasks we - * simply seek the "biggest" misfit task. - */ - if (rq->misfit_task_load > busiest_load) { - busiest_load = rq->misfit_task_load; - busiest = rq; - } - - break; - - } - } - - return busiest; -} /* * Max backoff if we encounter pinned tasks. 
Pretty arbitrary value, but @@ -9419,334 +7701,6 @@ return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - -static int active_load_balance_cpu_stop(void *data); - -static int should_we_balance(struct lb_env *env) -{ - struct sched_group *sg = env->sd->groups; - int cpu, balance_cpu = -1; - - /* - * Ensure the balancing environment is consistent; can happen - * when the softirq triggers 'during' hotplug. - */ - if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) - return 0; - - /* - * In the newly idle case, we will allow all the CPUs - * to do the newly idle load balance. - */ - if (env->idle == CPU_NEWLY_IDLE) - return 1; - - /* Try to find first idle CPU */ - for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { - if (!idle_cpu(cpu)) - continue; - - balance_cpu = cpu; - break; - } - - if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); - - /* - * First idle CPU or the first CPU(busiest) in this sched group - * is eligible for doing load balancing at this and above domains. - */ - return balance_cpu == env->dst_cpu; -} - -/* - * Check this_cpu to ensure it is balanced within domain. Attempt to move - * tasks if there is an imbalance. - */ -static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum cpu_idle_type idle, - int *continue_balancing) -{ - int ld_moved, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; - struct sched_group *group; - struct rq *busiest; - struct rq_flags rf; - struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - - struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .dst_grpmask = sched_group_span(sd->groups), - .idle = idle, - .loop_break = sched_nr_migrate_break, - .cpus = cpus, - .fbq_type = all, - .tasks = LIST_HEAD_INIT(env.tasks), - }; - - cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); - - schedstat_inc(sd->lb_count[idle]); - -redo: - if (!should_we_balance(&env)) { - *continue_balancing = 0; - goto out_balanced; - } - - group = find_busiest_group(&env); - if (!group) { - schedstat_inc(sd->lb_nobusyg[idle]); - goto out_balanced; - } - - busiest = find_busiest_queue(&env, group); - if (!busiest) { - schedstat_inc(sd->lb_nobusyq[idle]); - goto out_balanced; - } - - BUG_ON(busiest == env.dst_rq); - - schedstat_add(sd->lb_imbalance[idle], env.imbalance); - - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - - ld_moved = 0; - if (busiest->nr_running > 1) { - /* - * Attempt to move tasks. If find_busiest_group has found - * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. ld_moved simply stays zero, so it is - * correctly treated as an imbalance. - */ - env.flags |= LBF_ALL_PINNED; - env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); - -more_balance: - rq_lock_irqsave(busiest, &rf); - update_rq_clock(busiest); - - /* - * cur_ld_moved - load moved in current iteration - * ld_moved - cumulative load moved across iterations - */ - cur_ld_moved = detach_tasks(&env); - - /* - * We've detached some tasks from busiest_rq. Every - * task is masked "TASK_ON_RQ_MIGRATING", so we can safely - * unlock busiest->lock, and we are able to be sure - * that nobody can manipulate the tasks in parallel. - * See task_rq_lock() family for the details. 
- */ - - rq_unlock(busiest, &rf); - - if (cur_ld_moved) { - attach_tasks(&env); - ld_moved += cur_ld_moved; - } - - local_irq_restore(rf.flags); - - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - - /* - * Revisit (affine) tasks on src_cpu that couldn't be moved to - * us and move them to an alternate dst_cpu in our sched_group - * where they can run. The upper limit on how many times we - * iterate on same src_cpu is dependent on number of CPUs in our - * sched_group. - * - * This changes load balance semantics a bit on who can move - * load to a given_cpu. In addition to the given_cpu itself - * (or a ilb_cpu acting on its behalf where given_cpu is - * nohz-idle), we now have balance_cpu in a position to move - * load to given_cpu. In rare situations, this may cause - * conflicts (balance_cpu and given_cpu/ilb_cpu deciding - * _independently_ and at _same_ time to move some load to - * given_cpu) causing exceess load to be moved to given_cpu. - * This however should not happen so much in practice and - * moreover subsequent load balance cycles should correct the - * excess load moved. - */ - if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { - - /* Prevent to re-select dst_cpu via env's CPUs */ - __cpumask_clear_cpu(env.dst_cpu, env.cpus); - - env.dst_rq = cpu_rq(env.new_dst_cpu); - env.dst_cpu = env.new_dst_cpu; - env.flags &= ~LBF_DST_PINNED; - env.loop = 0; - env.loop_break = sched_nr_migrate_break; - - /* - * Go back to "more_balance" rather than "redo" since we - * need to continue with same src_cpu. - */ - goto more_balance; - } - - /* - * We failed to reach balance because of affinity. - */ - if (sd_parent) { - int *group_imbalance = &sd_parent->groups->sgc->imbalance; - - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) - *group_imbalance = 1; - } - - /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(env.flags & LBF_ALL_PINNED)) { - __cpumask_clear_cpu(cpu_of(busiest), cpus); - /* - * Attempting to continue load balancing at the current - * sched_domain level only makes sense if there are - * active CPUs remaining as possible busiest CPUs to - * pull load from which are not contained within the - * destination group that is receiving any migrated - * load. - */ - if (!cpumask_subset(cpus, env.dst_grpmask)) { - env.loop = 0; - env.loop_break = sched_nr_migrate_break; - goto redo; - } - goto out_all_pinned; - } - } - - if (!ld_moved) { - schedstat_inc(sd->lb_failed[idle]); - /* - * Increment the failure counter only on periodic balance. - * We do not want newidle balance, which can be very - * frequent, pollute the failure counter causing - * excessive cache_hot migrations and active balances. - */ - if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; - - if (need_active_balance(&env)) { - unsigned long flags; - - raw_spin_lock_irqsave(&busiest->lock, flags); - - /* - * Don't kick the active_load_balance_cpu_stop, - * if the curr task on busiest CPU can't be - * moved to this_cpu: - */ - if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { - raw_spin_unlock_irqrestore(&busiest->lock, - flags); - env.flags |= LBF_ALL_PINNED; - goto out_one_pinned; - } - - /* - * ->active_balance synchronizes accesses to - * ->active_balance_work. Once set, it's cleared - * only after active load balance is finished. 
- */ - if (!busiest->active_balance) { - busiest->active_balance = 1; - busiest->push_cpu = this_cpu; - active_balance = 1; - } - raw_spin_unlock_irqrestore(&busiest->lock, flags); - - if (active_balance) { - stop_one_cpu_nowait(cpu_of(busiest), - active_load_balance_cpu_stop, busiest, - &busiest->active_balance_work); - } - - /* We've kicked active balancing, force task migration. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; - } - } else - sd->nr_balance_failed = 0; - - if (likely(!active_balance) || voluntary_active_balance(&env)) { - /* We were unbalanced, so reset the balancing interval */ - sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; - } - - goto out; - -out_balanced: - /* - * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag only if other tasks got - * a chance to move and fix the imbalance. - */ - if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { - int *group_imbalance = &sd_parent->groups->sgc->imbalance; - - if (*group_imbalance) - *group_imbalance = 0; - } - -out_all_pinned: - /* - * We reach balance because all tasks are pinned at this level so - * we can't migrate them. Let the imbalance flag set so parent level - * can try to migrate them. - */ - schedstat_inc(sd->lb_balanced[idle]); - - sd->nr_balance_failed = 0; - -out_one_pinned: - ld_moved = 0; - - /* - * newidle_balance() disregards balance intervals, so we could - * repeatedly reach this code, which would lead to balance_interval - * skyrocketting in a short amount of time. Skip the balance_interval - * increase logic to avoid that. - */ - if (env.idle == CPU_NEWLY_IDLE) - goto out; - - /* tune up the balancing interval */ - if ((env.flags & LBF_ALL_PINNED && - sd->balance_interval < MAX_PINNED_INTERVAL) || - sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; -out: - return ld_moved; -} - static inline unsigned long get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) { @@ -9776,99 +7730,6 @@ } /* - * active_load_balance_cpu_stop is run by the CPU stopper. It pushes - * running tasks off the busiest CPU onto idle CPUs. It requires at - * least 1 task to be running on each physical CPU where possible, and - * avoids physical / logical imbalances. - */ -static int active_load_balance_cpu_stop(void *data) -{ - struct rq *busiest_rq = data; - int busiest_cpu = cpu_of(busiest_rq); - int target_cpu = busiest_rq->push_cpu; - struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; - struct task_struct *p = NULL; - struct rq_flags rf; - - rq_lock_irq(busiest_rq, &rf); - /* - * Between queueing the stop-work and running it is a hole in which - * CPUs can become inactive. We should not move tasks from or to - * inactive CPUs. - */ - if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) - goto out_unlock; - - /* Make sure the requested CPU hasn't gone down in the meantime: */ - if (unlikely(busiest_cpu != smp_processor_id() || - !busiest_rq->active_balance)) - goto out_unlock; - - /* Is there any task to move? */ - if (busiest_rq->nr_running <= 1) - goto out_unlock; - - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-CPU setup. 
- */ - BUG_ON(busiest_rq == target_rq); - - /* Search for an sd spanning us and the target CPU. */ - rcu_read_lock(); - for_each_domain(target_cpu, sd) { - if ((sd->flags & SD_LOAD_BALANCE) && - cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) - break; - } - - if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - /* - * can_migrate_task() doesn't need to compute new_dst_cpu - * for active balancing. Since we have CPU_IDLE, but no - * @dst_grpmask we need to make that test go away with lying - * about DST_PINNED. - */ - .flags = LBF_DST_PINNED, - }; - - schedstat_inc(sd->alb_count); - update_rq_clock(busiest_rq); - - p = detach_one_task(&env); - if (p) { - schedstat_inc(sd->alb_pushed); - /* Active balancing done, reset the failure counter. */ - sd->nr_balance_failed = 0; - } else { - schedstat_inc(sd->alb_failed); - } - } - rcu_read_unlock(); -out_unlock: - busiest_rq->active_balance = 0; - rq_unlock(busiest_rq, &rf); - - if (p) - attach_one_task(target_rq, p); - - local_irq_enable(); - - return 0; -} - -static DEFINE_SPINLOCK(balancing); - -/* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ @@ -9877,114 +7738,6 @@ max_load_balance_interval = HZ*num_online_cpus()/10; } -/* - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in init_sched_domains. - */ -static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) -{ - int continue_balancing = 1; - int cpu = rq->cpu; - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); - unsigned long interval; - struct sched_domain *sd; - /* Earliest time when we have to do rebalance again */ - unsigned long next_balance = jiffies + 60*HZ; - int update_next_balance = 0; - int need_serialize, need_decay = 0; - u64 max_cost = 0; - - rcu_read_lock(); - for_each_domain(cpu, sd) { - /* - * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. - */ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } - max_cost += sd->max_newidle_lb_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. - */ - if (!continue_balancing) { - if (need_decay) - continue; - break; - } - - interval = get_sd_balance_interval(sd, busy); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (!spin_trylock(&balancing)) - goto out; - } - - if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { - /* - * The LBF_DST_PINNED logic could have changed - * env->dst_cpu, so we can't know our idle - * state even if we migrated tasks. Update it. - */ - idle = idle_cpu(cpu) ? 
CPU_IDLE : CPU_NOT_IDLE; - busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); - } - sd->last_balance = jiffies; - interval = get_sd_balance_interval(sd, busy); - } - if (need_serialize) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) { - next_balance = sd->last_balance + interval; - update_next_balance = 1; - } - } - if (need_decay) { - /* - * Ensure the rq-wide value also decays but keep it at a - * reasonable floor to avoid funnies with rq->avg_idle. - */ - rq->max_idle_balance_cost = - max((u64)sysctl_sched_migration_cost, max_cost); - } - rcu_read_unlock(); - - /* - * next_balance will be updated only when there is a need. - * When the cpu is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) { - rq->next_balance = next_balance; - -#ifdef CONFIG_NO_HZ_COMMON - /* - * If this CPU has been elected to perform the nohz idle - * balance. Other idle CPUs have already rebalanced with - * nohz_idle_balance() and nohz.next_balance has been - * updated accordingly. This CPU is now running the idle load - * balance for itself and we need to update the - * nohz.next_balance accordingly. - */ - if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) - nohz.next_balance = rq->next_balance; -#endif - } -} static inline int on_null_domain(struct rq *rq) { @@ -10014,420 +7767,12 @@ return nr_cpu_ids; } -/* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). - */ -static void kick_ilb(unsigned int flags) -{ - int ilb_cpu; - - nohz.next_balance++; - - ilb_cpu = find_new_ilb(); - - if (ilb_cpu >= nr_cpu_ids) - return; - - flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); - if (flags & NOHZ_KICK_MASK) - return; - - /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target CPU which - * is idle. And the softirq performing nohz idle load balance - * will be run before returning from the IPI. - */ - smp_send_reschedule(ilb_cpu); -} - -/* - * Current decision point for kicking the idle load balancer in the presence - * of idle CPUs in the system. - */ -static void nohz_balancer_kick(struct rq *rq) -{ - unsigned long now = jiffies; - struct sched_domain_shared *sds; - struct sched_domain *sd; - int nr_busy, i, cpu = rq->cpu; - unsigned int flags = 0; - - if (unlikely(rq->idle_balance)) - return; - - /* - * We may be recently in ticked or tickless idle mode. At the first - * busy tick after returning from idle, we will update the busy stats. - */ - nohz_balance_exit_idle(rq); - - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return; - - if (READ_ONCE(nohz.has_blocked) && - time_after(now, READ_ONCE(nohz.next_blocked))) - flags = NOHZ_STATS_KICK; - - if (time_before(now, nohz.next_balance)) - goto out; - - if (rq->nr_running >= 2) { - flags = NOHZ_KICK_MASK; - goto out; - } - - rcu_read_lock(); - - sd = rcu_dereference(rq->sd); - if (sd) { - /* - * If there's a CFS task and the current CPU has reduced - * capacity; kick the ILB to see if there's a better CPU to run - * on. 
- */ - if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - } - - sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); - if (sd) { - /* - * When ASYM_PACKING; see if there's a more preferred CPU - * currently idle; in which case, kick the ILB to move tasks - * around. - */ - for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { - if (sched_asym_prefer(i, cpu)) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - } - } - - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); - if (sd) { - /* - * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU - * to run the misfit task on. - */ - if (check_misfit_status(rq, sd)) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - - /* - * For asymmetric systems, we do not want to nicely balance - * cache use, instead we want to embrace asymmetry and only - * ensure tasks have enough CPU capacity. - * - * Skip the LLC logic because it's not relevant in that case. - */ - goto unlock; - } - - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { - /* - * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread - * load within the current LLC domain (e.g. packed SMT cores but - * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks - * like this LLC domain has tasks we could move. - */ - nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - } -unlock: - rcu_read_unlock(); -out: - if (flags) - kick_ilb(flags); -} - -static void set_cpu_sd_state_busy(int cpu) -{ - struct sched_domain *sd; - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || !sd->nohz_idle) - goto unlock; - sd->nohz_idle = 0; - - atomic_inc(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - void nohz_balance_exit_idle(struct rq *rq) { - SCHED_WARN_ON(rq != this_rq()); - - if (likely(!rq->nohz_tick_stopped)) - return; - - rq->nohz_tick_stopped = 0; - cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - - set_cpu_sd_state_busy(rq->cpu); } -static void set_cpu_sd_state_idle(int cpu) -{ - struct sched_domain *sd; - - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - - if (!sd || sd->nohz_idle) - goto unlock; - sd->nohz_idle = 1; - - atomic_dec(&sd->shared->nr_busy_cpus); -unlock: - rcu_read_unlock(); -} - -/* - * This routine will record that the CPU is going idle with tick stopped. - * This info will be used in performing idle load balancing in the future. - */ void nohz_balance_enter_idle(int cpu) { - struct rq *rq = cpu_rq(cpu); - - SCHED_WARN_ON(cpu != smp_processor_id()); - - /* If this CPU is going down, then nothing needs to be done: */ - if (!cpu_active(cpu)) - return; - - /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) - return; - - /* - * Can be set safely without rq->lock held - * If a clear happens, it will have evaluated last additions because - * rq->lock is held during the check and the clear - */ - rq->has_blocked_load = 1; - - /* - * The tick is still stopped but load could have been added in the - * meantime. We set the nohz.has_blocked flag to trig a check of the - * *_avg. 
The CPU is already part of nohz.idle_cpus_mask so the clear - * of nohz.has_blocked can only happen after checking the new load - */ - if (rq->nohz_tick_stopped) - goto out; - - /* If we're a completely isolated CPU, we don't play: */ - if (on_null_domain(rq)) - return; - - rq->nohz_tick_stopped = 1; - - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - atomic_inc(&nohz.nr_cpus); - - /* - * Ensures that if nohz_idle_balance() fails to observe our - * @idle_cpus_mask store, it must observe the @has_blocked - * store. - */ - smp_mb__after_atomic(); - - set_cpu_sd_state_idle(cpu); - -out: - /* - * Each time a cpu enter idle, we assume that it has blocked load and - * enable the periodic update of the load of idle cpus - */ - WRITE_ONCE(nohz.has_blocked, 1); -} - -/* - * Internal function that runs load balance for all idle cpus. The load balance - * can be a simple update of blocked load or a complete load balance with - * tasks movement depending of flags. - * The function returns false if the loop has stopped before running - * through all idle CPUs. - */ -static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) -{ - /* Earliest time when we have to do rebalance again */ - unsigned long now = jiffies; - unsigned long next_balance = now + 60*HZ; - bool has_blocked_load = false; - int update_next_balance = 0; - int this_cpu = this_rq->cpu; - int balance_cpu; - int ret = false; - struct rq *rq; - - SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); - - /* - * We assume there will be no idle load after this update and clear - * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trig another update of idle load. - * Because a cpu that becomes idle, is added to idle_cpus_mask before - * setting the flag, we are sure to not clear the state and not - * check the load of an idle cpu. - */ - WRITE_ONCE(nohz.has_blocked, 0); - - /* - * Ensures that if we miss the CPU, we must see the has_blocked - * store from nohz_balance_enter_idle(). - */ - smp_mb(); - - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) - continue; - - /* - * If this CPU gets work to do, stop the load balancing - * work being done for other CPUs. Next load - * balancing owner will pick it up. - */ - if (need_resched()) { - has_blocked_load = true; - goto abort; - } - - rq = cpu_rq(balance_cpu); - - has_blocked_load |= update_nohz_stats(rq, true); - - /* - * If time for next balance is due, - * do the balance. - */ - if (time_after_eq(jiffies, rq->next_balance)) { - struct rq_flags rf; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); - rq_unlock_irqrestore(rq, &rf); - - if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(rq, CPU_IDLE); - } - - if (time_after(next_balance, rq->next_balance)) { - next_balance = rq->next_balance; - update_next_balance = 1; - } - } - - /* Newly idle CPU doesn't need an update */ - if (idle != CPU_NEWLY_IDLE) { - update_blocked_averages(this_cpu); - has_blocked_load |= this_rq->has_blocked_load; - } - - if (flags & NOHZ_BALANCE_KICK) - rebalance_domains(this_rq, CPU_IDLE); - - WRITE_ONCE(nohz.next_blocked, - now + msecs_to_jiffies(LOAD_AVG_PERIOD)); - - /* The full idle balance loop has been done */ - ret = true; - -abort: - /* There is still blocked load, enable periodic update */ - if (has_blocked_load) - WRITE_ONCE(nohz.has_blocked, 1); - - /* - * next_balance will be updated only when there is a need. 
- * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - - return ret; -} - -/* - * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the - * rebalancing for all the cpus for whom scheduler ticks are stopped. - */ -static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) -{ - int this_cpu = this_rq->cpu; - unsigned int flags; - - if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) - return false; - - if (idle != CPU_IDLE) { - atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - return false; - } - - /* could be _relaxed() */ - flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); - if (!(flags & NOHZ_KICK_MASK)) - return false; - - _nohz_idle_balance(this_rq, flags, idle); - - return true; -} - -static void nohz_newidle_balance(struct rq *this_rq) -{ - int this_cpu = this_rq->cpu; - - /* - * This CPU doesn't want to be disturbed by scheduler - * housekeeping - */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) - return; - - /* Will wake up very soon. No time for doing anything else*/ - if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; - - /* Don't need to update blocked load of idle CPUs*/ - if (!READ_ONCE(nohz.has_blocked) || - time_before(jiffies, READ_ONCE(nohz.next_blocked))) - return; - - raw_spin_unlock(&this_rq->lock); - /* - * This CPU is going to be idle and blocked load of idle CPUs - * need to be updated. Run the ilb locally as it is a good - * candidate for ilb instead of waking up another idle CPU. - * Kick an normal ilb if we failed to do the update. - */ - if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) - kick_ilb(NOHZ_STATS_KICK); - raw_spin_lock(&this_rq->lock); } #else /* !CONFIG_NO_HZ_COMMON */ @@ -10441,169 +7786,6 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ -/* - * idle_balance is called by schedule() if this_cpu is about to become - * idle. Attempts to pull tasks from other CPUs. - * - * Returns: - * < 0 - we released the lock and there are !fair tasks present - * 0 - failed, no new tasks - * > 0 - success, new (fair) tasks present - */ -int newidle_balance(struct rq *this_rq, struct rq_flags *rf) -{ - unsigned long next_balance = jiffies + HZ; - int this_cpu = this_rq->cpu; - struct sched_domain *sd; - int pulled_task = 0; - u64 curr_cost = 0; - - update_misfit_status(NULL, this_rq); - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - this_rq->idle_stamp = rq_clock(this_rq); - - /* - * Do not pull tasks towards !active CPUs... - */ - if (!cpu_active(this_cpu)) - return 0; - - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. 
- */ - rq_unpin_lock(this_rq, rf); - - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !READ_ONCE(this_rq->rd->overload)) { - - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); - if (sd) - update_next_balance(sd, &next_balance); - rcu_read_unlock(); - - nohz_newidle_balance(this_rq); - - goto out; - } - - raw_spin_unlock(&this_rq->lock); - - update_blocked_averages(this_cpu); - rcu_read_lock(); - for_each_domain(this_cpu, sd) { - int continue_balancing = 1; - u64 t0, domain_cost; - - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); - break; - } - - if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); - - pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, - &continue_balancing); - - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; - - curr_cost += domain_cost; - } - - update_next_balance(sd, &next_balance); - - /* - * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. - */ - if (pulled_task || this_rq->nr_running > 0) - break; - } - rcu_read_unlock(); - - raw_spin_lock(&this_rq->lock); - - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; - -out: - /* - * While browsing the domains, we released the rq lock, a task could - * have been enqueued in the meantime. Since we're not going idle, - * pretend we pulled a task. - */ - if (this_rq->cfs.h_nr_running && !pulled_task) - pulled_task = 1; - - /* Move the next balance forward */ - if (time_after(this_rq->next_balance, next_balance)) - this_rq->next_balance = next_balance; - - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) - pulled_task = -1; - - if (pulled_task) - this_rq->idle_stamp = 0; - - rq_repin_lock(this_rq, rf); - - return pulled_task; -} - -/* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). - */ -static __latent_entropy void run_rebalance_domains(struct softirq_action *h) -{ - struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? - CPU_IDLE : CPU_NOT_IDLE; - - /* - * If this CPU has a pending nohz_balance_kick, then do the - * balancing on behalf of the other idle CPUs whose ticks are - * stopped. Do nohz_idle_balance *before* rebalance_domains to - * give the idle CPUs a chance to load balance. Else we may - * load balance only within the local sched_domain hierarchy - * and abort nohz_idle_balance altogether if we pull some load. - */ - if (nohz_idle_balance(this_rq, idle)) - return; - - /* normal load balance */ - update_blocked_averages(this_rq->cpu); - rebalance_domains(this_rq, idle); -} - -/* - * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 
- */ -void trigger_load_balance(struct rq *rq) -{ - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) - return; - - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); - - nohz_balancer_kick(rq); -} static void rq_online_fair(struct rq *rq) { @@ -10640,9 +7822,6 @@ entity_tick(cfs_rq, se, queued); } - if (static_branch_unlikely(&sched_numa_balancing)) - task_tick_numa(rq, curr); - update_misfit_status(curr, rq); update_overutilized_status(task_rq(curr)); } @@ -10655,7 +7834,7 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, *curr; + struct sched_entity *curr; struct rq *rq = this_rq(); struct rq_flags rf; @@ -10666,20 +7845,9 @@ curr = cfs_rq->curr; if (curr) { update_curr(cfs_rq); - se->vruntime = curr->vruntime; } - place_entity(cfs_rq, se, 1); - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { - /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - swap(curr->vruntime, se->vruntime); - resched_curr(rq); - } - se->vruntime -= cfs_rq->min_vruntime; rq_unlock(rq, &rf); } @@ -10708,58 +7876,9 @@ check_preempt_curr(rq, p, 0); } -static inline bool vruntime_normalized(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - - /* - * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, - * the dequeue_entity(.flags=0) will already have normalized the - * vruntime. - */ - if (p->on_rq) - return true; - - /* - * When !on_rq, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - A forked child which is waiting for being woken up by - * wake_up_new_task(). - * - A task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - */ - if (!se->sum_exec_runtime || - (p->state == TASK_WAKING && p->sched_remote_wakeup)) - return true; - - return false; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -/* - * Propagate the changes of the sched_entity across the tg tree to make it - * visible to the root - */ -static void propagate_entity_cfs_rq(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq; - - /* Start to propagate at parent */ - se = se->parent; - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - if (cfs_rq_throttled(cfs_rq)) - break; - - update_load_avg(cfs_rq, se, UPDATE_TG); - } -} -#else static void propagate_entity_cfs_rq(struct sched_entity *se) { } -#endif + static void detach_entity_cfs_rq(struct sched_entity *se) { @@ -10776,14 +7895,6 @@ { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); @@ -10794,29 +7905,13 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. 
-	 */
-	place_entity(cfs_rq, se, 0);
-	se->vruntime -= cfs_rq->min_vruntime;
-	}
-
 	detach_entity_cfs_rq(se);
 }
 
 static void attach_task_cfs_rq(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
 	attach_entity_cfs_rq(se);
-
-	if (!vruntime_normalized(p))
-		se->vruntime += cfs_rq->min_vruntime;
 }
 
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -10879,6 +7974,8 @@
 #ifdef CONFIG_SMP
 	raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
+
+	cfs_rq->head = NULL;
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -11203,7 +8300,6 @@
 __init void init_sched_fair_class(void)
 {
 #ifdef CONFIG_SMP
-	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
diff --color -rubN linux-5.7.6/kernel/sched/sched.h linux-5.7.6.cachy/kernel/sched/sched.h
--- linux-5.7.6/kernel/sched/sched.h	2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/kernel/sched/sched.h	2020-07-24 17:52:04.479461959 +1000
@@ -516,6 +516,7 @@
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
+	struct sched_entity	*head;
 	struct sched_entity	*curr;
 	struct sched_entity	*next;
 	struct sched_entity	*last;
@@ -541,50 +542,7 @@
 		unsigned long	runnable_avg;
 	} removed;
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	unsigned long		tg_load_avg_contrib;
-	long			propagate;
-	long			prop_runnable_sum;
-
-	/*
-	 * h_load = weight * f(tg)
-	 *
-	 * Where f(tg) is the recursive weight fraction assigned to
-	 * this group.
-	 */
-	unsigned long		h_load;
-	u64			last_h_load_update;
-	struct sched_entity	*h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 #endif /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
-
-	/*
-	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
-	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
-	 * (like users, containers etc.)
-	 *
-	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
-	 * This list is used during load balance.
-	 */
-	int			on_list;
-	struct list_head	leaf_cfs_rq_list;
-	struct task_group	*tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_CFS_BANDWIDTH
-	int			runtime_enabled;
-	s64			runtime_remaining;
-
-	u64			throttled_clock;
-	u64			throttled_clock_task;
-	u64			throttled_clock_task_time;
-	int			throttled;
-	int			throttle_count;
-	struct list_head	throttled_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
 };
 
 static inline int rt_bandwidth_enabled(void)
diff --color -rubN linux-5.7.6/Makefile linux-5.7.6.cachy/Makefile
--- linux-5.7.6/Makefile	2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/Makefile	2020-07-24 14:33:53.453645295 +1000
@@ -2,8 +2,8 @@
 VERSION = 5
 PATCHLEVEL = 7
 SUBLEVEL = 6
-EXTRAVERSION =
-NAME = Kleptomaniac Octopus
+EXTRAVERSION = -cachy
+NAME = Cachy
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
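
The Makefile hunk only retags the build: EXTRAVERSION becomes "-cachy" and the release name becomes "Cachy", so a kernel built from this tree should report itself as 5.7.6-cachy. A minimal user-space check of that expectation, using nothing but the standard uname(2) call:

#include <stdio.h>
#include <sys/utsname.h>

int main(void)
{
        struct utsname u;

        if (uname(&u) != 0) {
                perror("uname");
                return 1;
        }
        /* expected to print something like "5.7.6-cachy" on the patched kernel */
        printf("%s\n", u.release);
        return 0;
}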
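
The sched.h hunk adds a head pointer next to curr/next/last in struct cfs_rq, and init_cfs_rq() now clears it with cfs_rq->head = NULL. How entities are actually linked onto that head is not visible in the hunks above, so the following is only a minimal user-space sketch of a head-rooted run list, assuming a NULL-terminated, push-to-front policy; the *_model types and functions are hypothetical stand-ins, not the patch's kernel code.

#include <stdio.h>
#include <stddef.h>

struct entity_model {
        const char *comm;
        struct entity_model *next;      /* forward link in the run list */
};

struct cfs_rq_model {
        struct entity_model *head;      /* NULL when no entity is queued */
};

static void enqueue_model(struct cfs_rq_model *cfs_rq, struct entity_model *se)
{
        se->next = cfs_rq->head;        /* push at the front ... */
        cfs_rq->head = se;              /* ... head always points at the first entity */
}

static struct entity_model *dequeue_model(struct cfs_rq_model *cfs_rq)
{
        struct entity_model *se = cfs_rq->head;

        if (se)
                cfs_rq->head = se->next;
        return se;
}

int main(void)
{
        struct cfs_rq_model cfs_rq = { .head = NULL };
        struct entity_model a = { .comm = "a" }, b = { .comm = "b" };
        struct entity_model *se;

        enqueue_model(&cfs_rq, &a);
        enqueue_model(&cfs_rq, &b);
        while ((se = dequeue_model(&cfs_rq)))
                printf("picked %s\n", se->comm);
        return 0;
}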
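
The lines deleted from detach_task_cfs_rq() and attach_task_cfs_rq() are where stock CFS re-based a task's vruntime when it left one cfs_rq and joined another: subtract the old queue's min_vruntime on the way out, add the new queue's on the way in, so the task keeps its relative position. Since the patch drops that bookkeeping, the pair of statements is shown here only as a plain-integer illustration of the invariant they used to maintain; the *_model names are illustrative, not kernel API.

#include <stdio.h>
#include <stdint.h>

struct cfs_rq_model { uint64_t min_vruntime; };
struct se_model     { uint64_t vruntime; };

/* what the removed "se->vruntime -= cfs_rq->min_vruntime;" did on detach */
static void detach_model(struct se_model *se, const struct cfs_rq_model *from)
{
        se->vruntime -= from->min_vruntime;
}

/* what the removed "se->vruntime += cfs_rq->min_vruntime;" did on attach */
static void attach_model(struct se_model *se, const struct cfs_rq_model *to)
{
        se->vruntime += to->min_vruntime;
}

int main(void)
{
        struct cfs_rq_model cpu0 = { .min_vruntime = 1000 };
        struct cfs_rq_model cpu1 = { .min_vruntime = 5000 };
        struct se_model se = { .vruntime = 1200 };      /* 200 above cpu0's min_vruntime */

        detach_model(&se, &cpu0);
        attach_model(&se, &cpu1);

        /* the relative offset survives the move: prints 200 */
        printf("%llu\n", (unsigned long long)(se.vruntime - cpu1.min_vruntime));
        return 0;
}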
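
Taken together, the fair.c hunks remove rebalance_domains(), the NOHZ idle-balance path, newidle_balance() and run_rebalance_domains(), and init_sched_fair_class() no longer registers a handler via open_softirq(SCHED_SOFTIRQ, run_rebalance_domains), so nothing in this tree should raise or service the SCHED softirq once it is booted. One rough way to sanity-check that expectation on a running kernel is to watch the SCHED row of /proc/softirqs; a hypothetical, minimal reader:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* print the SCHED row of /proc/softirqs; the counters would be expected to stay flat */
static void print_sched_row(void)
{
        char line[1024];
        FILE *f = fopen("/proc/softirqs", "r");

        if (!f) {
                perror("/proc/softirqs");
                return;
        }
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "SCHED:"))
                        fputs(line, stdout);
        fclose(f);
}

int main(void)
{
        print_sched_row();
        sleep(10);              /* let the system run for a while */
        print_sched_row();
        return 0;
}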