linux-tkg/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch

3937 lines
106 KiB
Diff
Raw Normal View History

Merge linux-tkg packages into a single package, add Void Linux and Clang/LLVM support (#63) * Add Clang/LLVM and Void Linux support * merge packages, update .gitignore * Stage 2 of merge * Stage 3 of merge * Stage 4 of merge * Stage 5 of merge, almost done * Complete merge of linux-tkg, sync with master (undead PDS 5.8) * Complete merge of linux-tkg, sync with master (undead PDS 5.8) * Forgot to add all the patches * Fix prompt, add config_hardened files * Fix some stuff for Void * Merge linux-tkg README, add Void Linux info * typo * Fix broken MuQSS on Void at least, Fix CPU opts patch apply * update to 5.8.8, 5.4.64, block LLVM build with kernel 5.7 because it seems to segfault at linking vmlinux, fix RC building on Void Linux * update to 5.8.8, 5.4.64. seems making a confdir variable wasn't necessary, revert it. sync with master. * remove variables for messages in favor of defined functions in the Void template * resync Linux58-tkg back to master * Clear patches is the same file for every version * glitched base is the same file for 5.8 and 5.9, fix wrong version in kernel selection for 5.8, also just use * since it's the same .-. * merge some patches that are the same together, fix building 5.4 BMQ * BMQ and PDS ondemand are the same file, fix missing space on kernel version selection * add Clang makedepends to PKGBUILD, add missing compiler option in customization.cfg, make GCC recommended * Add kernel selection to install.sh * Somehow this wasn't edited * Add optional clang deps to install.sh * Update gitignore to just ignore linux-5.x instead of commenting it out * Missing fi * forgot to sync back install.sh... 
* generalize desktop profile configs, uncomment fsync and bcachefs in customization.cfg, add Project C /PDS and Undead PDS to README, credit plug, add kernel version selection to customization.cfg, fallback for compiler selection * Fix compiler selection error out on Void (for some reason xbps-src always runs the else statement) also avoid conflict with _compiler, rename it to _compiler_opt for the actual options in the make command * oops * Fix Void extra config not appearing, missing indent for _configfile, missed PKGBUILD in other commit... * remove globbing in install.sh * Move loading of external configuration to the beginning of _tkg_initscript * Change MuQSS note in README * 5.4.68, 5.8.12, 5.9-rc7 * Update Project C patchset to v5.8-r3 * 5.8 bcachefs * LLVM=1 was required to be used on make commands otherwise it would reset config, we don't need to know if gcc was used to build a kernel (since llvm is appended to llvm builds), remove the compiler_name if not using llvm, make kernel versions variables in prepare * _compileropt does not need to be llvm specific * add fix for AMD GPU DCN3.0 build with clang in glitched base 5.9 https://lore.kernel.org/amd-gfx/4b5927dd-9f2d-40f9-ef63-007cf7b65f37@amd.com/T/#t * Revert "add fix for AMD GPU DCN3.0 build with clang in glitched base 5.9" This reverts commit 276e219f9fe89397332c91e601f34a37b3a0503f. merged upstream * Sync with linux-tkg master * Just stick every patch into its own kernel version folder * update 5.4.72, 5.8.16 * check for sum check fail on Void, fix fsync patch * Update README.md * README.md formatting * forgot to move that * linux59-tkg: Import 5.9 version of the bcachefs patchset - https://github.com/koverstreet/bcachefs Co-authored-by: Lukáš Horáček <flgx@protonmail.com>
2020-10-25 19:43:11 -04:00
diff --color -rubN linux-5.7.6/include/linux/sched.h linux-5.7.6.cachy/include/linux/sched.h
--- linux-5.7.6/include/linux/sched.h 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/include/linux/sched.h 2020-07-24 17:51:45.879582847 +1000
@@ -452,9 +452,14 @@
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+
+ struct sched_entity* next[2];
+
struct list_head group_node;
unsigned int on_rq;
+ int quantom;
+
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
@@ -464,16 +469,6 @@
struct sched_statistics statistics;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- int depth;
- struct sched_entity *parent;
- /* rq on which this entity is (to be) queued: */
- struct cfs_rq *cfs_rq;
- /* rq "owned" by this entity/group: */
- struct cfs_rq *my_q;
- /* cached value of my_q->h_nr_running */
- unsigned long runnable_weight;
-#endif
#ifdef CONFIG_SMP
/*
diff --color -rubN linux-5.7.6/kernel/sched/core.c linux-5.7.6.cachy/kernel/sched/core.c
--- linux-5.7.6/kernel/sched/core.c 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/kernel/sched/core.c 2020-07-24 17:51:57.991504128 +1000
@@ -2672,18 +2672,14 @@
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
- INIT_LIST_HEAD(&p->se.group_node);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- p->se.cfs_rq = NULL;
-#endif
+ INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
@@ -3246,31 +3242,10 @@
#ifdef CONFIG_SMP
-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
+///* rq->lock is NOT held, but preemption is disabled */
static inline void balance_callback(struct rq *rq)
{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
}
#else
@@ -3606,7 +3581,6 @@
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
#endif
}
@@ -6574,23 +6548,12 @@
wait_bit_init();
-#ifdef CONFIG_FAIR_GROUP_SCHED
- ptr += 2 * nr_cpu_ids * sizeof(void **);
-#endif
#ifdef CONFIG_RT_GROUP_SCHED
ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
if (ptr) {
ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.se = (struct sched_entity **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
-
- root_task_group.cfs_rq = (struct cfs_rq **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
-
-#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -6641,32 +6604,7 @@
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- /*
- * How much CPU bandwidth does root_task_group get?
- *
- * In case of task-groups formed thr' the cgroup filesystem, it
- * gets 100% of the CPU resources in the system. This overall
- * system CPU resource is divided among the tasks of
- * root_task_group and its child task-groups in a fair manner,
- * based on each entity's (task or task-group's) weight
- * (se->load.weight).
- *
- * In other words, if root_task_group has 10 tasks of weight
- * 1024) and two child groups A0 and A1 (of weight 1024 each),
- * then A0's share of the CPU resource is:
- *
- * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
- *
- * We achieve this by letting root_task_group's tasks sit
- * directly in rq->cfs (i.e root_task_group->se[] = NULL).
- */
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
- init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
diff --color -rubN linux-5.7.6/kernel/sched/debug.c linux-5.7.6.cachy/kernel/sched/debug.c
--- linux-5.7.6/kernel/sched/debug.c 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/kernel/sched/debug.c 2020-07-24 17:52:15.419390856 +1000
@@ -385,7 +385,7 @@
return;
PN(se->exec_start);
- PN(se->vruntime);
+ //PN(se->vruntime);
PN(se->sum_exec_runtime);
if (schedstat_enabled()) {
@@ -437,9 +437,9 @@
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
- p->comm, task_pid_nr(p),
- SPLIT_NS(p->se.vruntime),
+ SEQ_printf(m, "%15s %5d %9d %9Ld %8d ",
+ p->comm, task_pid_nr(p), p->se.quantom,
+ //SPLIT_NS(p->se.vruntime),%9Ld.%06ld
(long long)(p->nvcsw + p->nivcsw),
p->prio);
@@ -464,9 +464,9 @@
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
- SEQ_printf(m, " S task PID tree-key switches prio"
+ SEQ_printf(m, " S task PID quantom switches prio"
" wait-time sum-exec sum-sleep\n");
- SEQ_printf(m, "-------------------------------------------------------"
+ SEQ_printf(m, "--------------------------------------------------------------------"
"----------------------------------------------------\n");
rcu_read_lock();
@@ -481,10 +481,8 @@
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
- spread, rq0_min_vruntime, spread0;
struct rq *rq = cpu_rq(cpu);
- struct sched_entity *last;
+ //struct sched_entity *last;
unsigned long flags;
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -498,26 +496,26 @@
SPLIT_NS(cfs_rq->exec_clock));
raw_spin_lock_irqsave(&rq->lock, flags);
- if (rb_first_cached(&cfs_rq->tasks_timeline))
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
- last = __pick_last_entity(cfs_rq);
- if (last)
- max_vruntime = last->vruntime;
- min_vruntime = cfs_rq->min_vruntime;
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
+ //if (rb_first_cached(&cfs_rq->tasks_timeline))
+ //MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
+ //last = __pick_last_entity(cfs_rq);
+ //if (last)
+ //max_vruntime = last->vruntime;
+ //min_vruntime = cfs_rq->min_vruntime;
+ //rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
raw_spin_unlock_irqrestore(&rq->lock, flags);
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
- SPLIT_NS(MIN_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
- SPLIT_NS(min_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
- SPLIT_NS(max_vruntime));
- spread = max_vruntime - MIN_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
- SPLIT_NS(spread));
- spread0 = min_vruntime - rq0_min_vruntime;
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
- SPLIT_NS(spread0));
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
+ //SPLIT_NS(MIN_vruntime));
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
+ //SPLIT_NS(min_vruntime));
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
+ //SPLIT_NS(max_vruntime));
+ //spread = max_vruntime - MIN_vruntime;
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
+ //SPLIT_NS(spread));
+ //spread0 = min_vruntime - rq0_min_vruntime;
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
+ //SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
@@ -875,7 +873,7 @@
#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
PN(se.exec_start);
- PN(se.vruntime);
+ //PN(se.vruntime);
PN(se.sum_exec_runtime);
nr_switches = p->nvcsw + p->nivcsw;
diff --color -rubN linux-5.7.6/kernel/sched/fair.c linux-5.7.6.cachy/kernel/sched/fair.c
--- linux-5.7.6/kernel/sched/fair.c 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/kernel/sched/fair.c 2020-07-24 17:52:09.159431543 +1000
@@ -86,6 +86,9 @@
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#define DIR_RIGHT 0
+#define DIR_LEFT 1
+
int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
@@ -259,193 +262,6 @@
* CFS operations on generic schedulable entities:
*/
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline struct task_struct *task_of(struct sched_entity *se)
-{
- SCHED_WARN_ON(!entity_is_task(se));
- return container_of(se, struct task_struct, se);
-}
-
-/* Walk up scheduling entities hierarchy */
-#define for_each_sched_entity(se) \
- for (; se; se = se->parent)
-
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
- return p->se.cfs_rq;
-}
-
-/* runqueue on which this entity is (to be) queued */
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
-{
- return se->cfs_rq;
-}
-
-/* runqueue "owned" by this group */
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
-{
- return grp->my_q;
-}
-
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (!path)
- return;
-
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- autogroup_path(cfs_rq->tg, path, len);
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- else
- strlcpy(path, "(null)", len);
-}
-
-static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
-
- if (cfs_rq->on_list)
- return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
-
- cfs_rq->on_list = 1;
-
- /*
- * Ensure we either appear before our parent (if already
- * enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases and a special case for the root
- * cfs_rq. Furthermore, it also means that we will always reset
- * tmp_alone_branch either when the branch is connected
- * to a tree or when we reach the top of the tree
- */
- if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
- /*
- * If parent is already on the list, we add the child
- * just before. Thanks to circular linked property of
- * the list, this means to put the child at the tail
- * of the list that starts by parent.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
- /*
- * The branch is now connected to its tree so we can
- * reset tmp_alone_branch to the beginning of the
- * list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- return true;
- }
-
- if (!cfs_rq->tg->parent) {
- /*
- * cfs rq without parent should be put
- * at the tail of the list.
- */
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq->leaf_cfs_rq_list);
- /*
- * We have reach the top of a tree so we can reset
- * tmp_alone_branch to the beginning of the list.
- */
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- return true;
- }
-
- /*
- * The parent has not already been added so we want to
- * make sure that it will be put after us.
- * tmp_alone_branch points to the begin of the branch
- * where we will add parent.
- */
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
- /*
- * update tmp_alone_branch to points to the new begin
- * of the branch
- */
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
- return false;
-}
-
-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->on_list) {
- struct rq *rq = rq_of(cfs_rq);
-
- /*
- * With cfs_rq being unthrottled/throttled during an enqueue,
- * it can happen the tmp_alone_branch points the a leaf that
- * we finally want to del. In this case, tmp_alone_branch moves
- * to the prev element but it will point to rq->leaf_cfs_rq_list
- * at the end of the enqueue.
- */
- if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
- rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
-
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- cfs_rq->on_list = 0;
- }
-}
-
-static inline void assert_list_leaf_cfs_rq(struct rq *rq)
-{
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
-}
-
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
- list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
- leaf_cfs_rq_list)
-
-/* Do the two (enqueued) entities belong to the same group ? */
-static inline struct cfs_rq *
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- if (se->cfs_rq == pse->cfs_rq)
- return se->cfs_rq;
-
- return NULL;
-}
-
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
-{
- return se->parent;
-}
-
-static void
-find_matching_se(struct sched_entity **se, struct sched_entity **pse)
-{
- int se_depth, pse_depth;
-
- /*
- * preemption test can be made between sibling entities who are in the
- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
- * both tasks until we find their ancestors who are siblings of common
- * parent.
- */
-
- /* First walk up until both entities are at same depth */
- se_depth = (*se)->depth;
- pse_depth = (*pse)->depth;
-
- while (se_depth > pse_depth) {
- se_depth--;
- *se = parent_entity(*se);
- }
-
- while (pse_depth > se_depth) {
- pse_depth--;
- *pse = parent_entity(*pse);
- }
-
- while (!is_same_group(*se, *pse)) {
- *se = parent_entity(*se);
- *pse = parent_entity(*pse);
- }
-}
-
-#else /* !CONFIG_FAIR_GROUP_SCHED */
static inline struct task_struct *task_of(struct sched_entity *se)
{
@@ -506,138 +322,67 @@
{
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
-/**************************************************************
- * Scheduling class tree data structure manipulation methods:
+/*
+ * Enqueue an entity
*/
-
-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- s64 delta = (s64)(vruntime - max_vruntime);
- if (delta > 0)
- max_vruntime = vruntime;
+ se->next[DIR_RIGHT] = NULL;
+ se->next[DIR_LEFT] = NULL;
- return max_vruntime;
-}
+ if (likely(cfs_rq->head))
+ {
+ se->next[DIR_RIGHT] = cfs_rq->head;
+ cfs_rq->head->next[DIR_LEFT] = se;
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
-{
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta < 0)
- min_vruntime = vruntime;
+ // lastly reset the head
+ cfs_rq->head = se;
- return min_vruntime;
-}
+ return;
+ }
-static inline int entity_before(struct sched_entity *a,
- struct sched_entity *b)
-{
- return (s64)(a->vruntime - b->vruntime) < 0;
+ // if empty rq
+ cfs_rq->head = se;
}
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct sched_entity *curr = cfs_rq->curr;
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
-
- u64 vruntime = cfs_rq->min_vruntime;
- if (curr) {
- if (curr->on_rq)
- vruntime = curr->vruntime;
- else
- curr = NULL;
+ // if only one se in rq
+ if (unlikely(cfs_rq->head->next[DIR_RIGHT] == NULL))
+ cfs_rq->head = NULL;
+ else if (unlikely(se == cfs_rq->head))
+ {
+ // if it is the head
+ cfs_rq->head = cfs_rq->head->next[DIR_RIGHT];
+ cfs_rq->head->next[DIR_LEFT] = NULL;
}
-
- if (leftmost) { /* non-empty tree */
- struct sched_entity *se;
- se = rb_entry(leftmost, struct sched_entity, run_node);
-
- if (!curr)
- vruntime = se->vruntime;
else
- vruntime = min_vruntime(vruntime, se->vruntime);
- }
-
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
-}
+ {
+ // if in the middle
+ struct sched_entity *prev = se->next[DIR_LEFT];
+ struct sched_entity *next = se->next[DIR_RIGHT];
-/*
- * Enqueue an entity into the rb-tree:
- */
-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
- struct rb_node *parent = NULL;
- struct sched_entity *entry;
- bool leftmost = true;
+ prev->next[DIR_RIGHT] = next;
- /*
- * Find the right place in the rbtree:
- */
- while (*link) {
- parent = *link;
- entry = rb_entry(parent, struct sched_entity, run_node);
- /*
- * We dont care about collisions. Nodes with
- * the same key stay together.
- */
- if (entity_before(se, entry)) {
- link = &parent->rb_left;
- } else {
- link = &parent->rb_right;
- leftmost = false;
+ if (next)
+ next->next[DIR_LEFT] = prev;
}
- }
-
- rb_link_node(&se->run_node, parent, link);
- rb_insert_color_cached(&se->run_node,
- &cfs_rq->tasks_timeline, leftmost);
-}
-
-static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
-
- if (!left)
- return NULL;
-
- return rb_entry(left, struct sched_entity, run_node);
-}
-
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
-{
- struct rb_node *next = rb_next(&se->run_node);
-
- if (!next)
- return NULL;
-
- return rb_entry(next, struct sched_entity, run_node);
+ return cfs_rq->head;
}
#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
-
- if (!last)
- return NULL;
-
- return rb_entry(last, struct sched_entity, run_node);
+ return cfs_rq->head;
}
/**************************************************************
@@ -723,16 +468,6 @@
return slice;
}
-/*
- * We calculate the vruntime slice of a to-be-inserted task.
- *
- * vs = s/w
- */
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
-}
-
#include "pelt.h"
#ifdef CONFIG_SMP
@@ -856,6 +591,7 @@
return;
curr->exec_start = now;
+ curr->quantom++;
schedstat_set(curr->statistics.exec_max,
max(delta_exec, curr->statistics.exec_max));
@@ -864,12 +600,10 @@
schedstat_add(cfs_rq->exec_clock, delta_exec);
curr->vruntime += calc_delta_fair(delta_exec, curr);
- update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
@@ -2897,39 +2631,6 @@
}
}
-/*
- * Drive the periodic memory faults..
- */
-static void task_tick_numa(struct rq *rq, struct task_struct *curr)
-{
- struct callback_head *work = &curr->numa_work;
- u64 period, now;
-
- /*
- * We don't care about NUMA placement if we don't have memory.
- */
- if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
- return;
-
- /*
- * Using runtime rather than walltime has the dual advantage that
- * we (mostly) drive the selection from busy threads and that the
- * task needs to have done some actual work before we bother with
- * NUMA placement.
- */
- now = curr->se.sum_exec_runtime;
- period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
-
- if (now > curr->node_stamp + period) {
- if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_start(curr);
- curr->node_stamp += period;
-
- if (!time_before(jiffies, curr->mm->numa_next_scan))
- task_work_add(curr, work, true);
- }
-}
-
static void update_scan_period(struct task_struct *p, int new_cpu)
{
int src_nid = cpu_to_node(task_cpu(p));
@@ -2965,9 +2666,6 @@
}
#else
-static void task_tick_numa(struct rq *rq, struct task_struct *curr)
-{
-}
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
@@ -4072,50 +3770,9 @@
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
- s64 d = se->vruntime - cfs_rq->min_vruntime;
-
- if (d < 0)
- d = -d;
-
- if (d > 3*sysctl_sched_latency)
- schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
-static void
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
-{
- u64 vruntime = cfs_rq->min_vruntime;
-
- /*
- * The 'current' period is already promised to the current tasks,
- * however the extra weight of the new task will slow them down a
- * little, place the new task so that it fits in the slot that
- * stays open at the end.
- */
- if (initial && sched_feat(START_DEBIT))
- vruntime += sched_vslice(cfs_rq, se);
-
- /* sleeps up to a single latency don't count. */
- if (!initial) {
- unsigned long thresh = sysctl_sched_latency;
-
- /*
- * Halve their sleep time's effect, to allow
- * for a gentler effect of sleepers:
- */
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
- thresh >>= 1;
-
- vruntime -= thresh;
- }
-
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
-}
-
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
-
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
@@ -4171,28 +3828,11 @@
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
bool curr = cfs_rq->curr == se;
- /*
- * If we're the current task, we must renormalise before calling
- * update_curr().
- */
- if (renorm && curr)
- se->vruntime += cfs_rq->min_vruntime;
-
update_curr(cfs_rq);
/*
- * Otherwise, renormalise after, such that we're placed at the current
- * moment in time, instead of some random moment in the past. Being
- * placed in the past could significantly boost this task to the
- * fairness detriment of existing tasks.
- */
- if (renorm && !curr)
- se->vruntime += cfs_rq->min_vruntime;
-
- /*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - Add its load to cfs_rq->runnable_avg
@@ -4205,71 +3845,12 @@
update_cfs_group(se);
account_entity_enqueue(cfs_rq, se);
- if (flags & ENQUEUE_WAKEUP)
- place_entity(cfs_rq, se, 0);
-
check_schedstat_required();
update_stats_enqueue(cfs_rq, se, flags);
check_spread(cfs_rq, se);
if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
-
- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionnally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
- list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
- check_enqueue_throttle(cfs_rq);
-}
-
-static void __clear_buddies_last(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last != se)
- break;
-
- cfs_rq->last = NULL;
- }
-}
-
-static void __clear_buddies_next(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next != se)
- break;
-
- cfs_rq->next = NULL;
- }
-}
-
-static void __clear_buddies_skip(struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip != se)
- break;
-
- cfs_rq->skip = NULL;
- }
-}
-
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (cfs_rq->last == se)
- __clear_buddies_last(se);
-
- if (cfs_rq->next == se)
- __clear_buddies_next(se);
-
- if (cfs_rq->skip == se)
- __clear_buddies_skip(se);
}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
@@ -4295,75 +3876,15 @@
update_stats_dequeue(cfs_rq, se, flags);
- clear_buddies(cfs_rq, se);
-
- if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+
se->on_rq = 0;
account_entity_dequeue(cfs_rq, se);
- /*
- * Normalize after update_curr(); which will also have moved
- * min_vruntime if @se is the one holding it back. But before doing
- * update_min_vruntime() again, which will discount @se's position and
- * can move min_vruntime forward still more.
- */
- if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
-
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
update_cfs_group(se);
-
- /*
- * Now advance min_vruntime if @se was the entity holding it back,
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- * put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- ie. we'll be penalized.
- */
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
- update_min_vruntime(cfs_rq);
-}
-
-/*
- * Preempt the current task with a newly woken task if needed:
- */
-static void
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- unsigned long ideal_runtime, delta_exec;
- struct sched_entity *se;
- s64 delta;
-
- ideal_runtime = sched_slice(cfs_rq, curr);
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime) {
- resched_curr(rq_of(cfs_rq));
- /*
- * The current task ran long enough, ensure it doesn't get
- * re-elected due to buddy favours.
- */
- clear_buddies(cfs_rq, curr);
- return;
- }
-
- /*
- * Ensure that a task that missed wakeup preemption by a
- * narrow margin doesn't have to wait for a full slice.
- * This also mitigates buddy induced latencies under load.
- */
- if (delta_exec < sysctl_sched_min_granularity)
- return;
-
- se = __pick_first_entity(cfs_rq);
- delta = curr->vruntime - se->vruntime;
-
- if (delta < 0)
- return;
-
- if (delta > ideal_runtime)
- resched_curr(rq_of(cfs_rq));
}
static void
@@ -4371,96 +3892,18 @@
{
/* 'current' is not kept within the tree. */
if (se->on_rq) {
- /*
- * Any task has to be enqueued before it get to execute on
- * a CPU. So account for the time it spent waiting on the
- * runqueue.
- */
update_stats_wait_end(cfs_rq, se);
- __dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
- /*
- * Track our maximum slice length, if the CPU's load is at
- * least twice that of our own weight (i.e. dont track it
- * when there are only lesser-weight tasks around):
- */
- if (schedstat_enabled() &&
- rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
- schedstat_set(se->statistics.slice_max,
- max((u64)schedstat_val(se->statistics.slice_max),
- se->sum_exec_runtime - se->prev_sum_exec_runtime));
- }
-
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-
-/*
- * Pick the next process, keeping these things in mind, in this order:
- * 1) keep things fair between processes/task groups
- * 2) pick the "next" process, since someone really wants that to run
- * 3) pick the "last" process, for cache locality
- * 4) do not run the "skip" process, if something else is available
- */
-static struct sched_entity *
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-{
- struct sched_entity *left = __pick_first_entity(cfs_rq);
- struct sched_entity *se;
-
- /*
- * If curr is set we have to see if its left of the leftmost entity
- * still in the tree, provided there was anything in the tree at all.
- */
- if (!left || (curr && entity_before(curr, left)))
- left = curr;
-
- se = left; /* ideally we run the leftmost entity */
-
- /*
- * Avoid running the skip buddy, if running something else can
- * be done without getting too unfair.
- */
- if (cfs_rq->skip == se) {
- struct sched_entity *second;
-
- if (se == curr) {
- second = __pick_first_entity(cfs_rq);
- } else {
- second = __pick_next_entity(se);
- if (!second || (curr && entity_before(curr, second)))
- second = curr;
- }
-
- if (second && wakeup_preempt_entity(second, left) < 1)
- se = second;
- }
-
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
-
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
-
- clear_buddies(cfs_rq, se);
-
- return se;
-}
-
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -4471,21 +3914,19 @@
if (prev->on_rq)
update_curr(cfs_rq);
- /* throttle cfs_rqs exceeding runtime */
- check_cfs_rq_runtime(cfs_rq);
-
- check_spread(cfs_rq, prev);
-
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
- /* Put 'current' back into the tree. */
- __enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
cfs_rq->curr = NULL;
}
+static int check_preempt_curr_fair(struct sched_entity *curr) /* tick-time preemption check; curr is not read */
+{
+ return 1; /* constant 1: the entity_tick() hunk calls resched_curr() whenever this equals 1 */
+}
+
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
@@ -4509,6 +3950,12 @@
resched_curr(rq_of(cfs_rq));
return;
}
+
+ if (check_preempt_curr_fair(curr) == 1) {
+ resched_curr(rq_of(cfs_rq));
+ return;
+ }
+
/*
* don't let the period tick interfere with the hrtick preemption
*/
@@ -4516,9 +3963,6 @@
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
-
- if (cfs_rq->nr_running > 1)
- check_preempt_tick(cfs_rq, curr);
}
@@ -5082,30 +4526,6 @@
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
-/*
- * When a group wakes up we want to make sure that its quota is not already
- * expired/exceeded, otherwise it may be allowed to steal additional ticks of
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
- */
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
-{
- if (!cfs_bandwidth_used())
- return;
-
- /* an active group must be handled by the update_curr()->put() path */
- if (!cfs_rq->runtime_enabled || cfs_rq->curr)
- return;
-
- /* ensure the group is not already throttled */
- if (cfs_rq_throttled(cfs_rq))
- return;
-
- /* update runtime allocation */
- account_cfs_rq_runtime(cfs_rq, 0);
- if (cfs_rq->runtime_remaining <= 0)
- throttle_cfs_rq(cfs_rq);
-}
-
static void sync_throttle(struct task_group *tg, int cpu)
{
struct cfs_rq *pcfs_rq, *cfs_rq;
@@ -5123,26 +4543,6 @@
cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
}
-/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- if (!cfs_bandwidth_used())
- return false;
-
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return false;
-
- /*
- * it's possible for a throttled entity to be forced into a running
- * state (e.g. set_curr_task), in this case we're finished.
- */
- if (cfs_rq_throttled(cfs_rq))
- return true;
-
- throttle_cfs_rq(cfs_rq);
- return true;
-}
-
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
@@ -5318,8 +4718,6 @@
}
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -5548,8 +4946,6 @@
hrtick_update(rq);
}
-static void set_next_buddy(struct sched_entity *se);
-
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
@@ -5578,12 +4974,6 @@
if (cfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
se = parent_entity(se);
- /*
- * Bias pick_next to pick a task from this cfs_rq, as
- * p is sleeping when it is within its sched_slice.
- */
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
- set_next_buddy(se);
break;
}
flags |= DEQUEUE_SLEEP;
@@ -5699,53 +5089,6 @@
return cpu_rq(cpu)->cpu_capacity;
}
-static void record_wakee(struct task_struct *p)
-{
- /*
- * Only decay a single time; tasks that have less then 1 wakeup per
- * jiffy will not have built up many flips.
- */
- if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
- current->wakee_flips >>= 1;
- current->wakee_flip_decay_ts = jiffies;
- }
-
- if (current->last_wakee != p) {
- current->last_wakee = p;
- current->wakee_flips++;
- }
-}
-
-/*
- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
- *
- * A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees.
- *
- * In order to determine whether we should let the load spread vs consolidating
- * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.
- *
- * With both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size.
- *
- * Waker/wakee being client/server, worker/dispatcher, interrupt source or
- * whatever is irrelevant, spread criteria is apparent partner count exceeds
- * socket size.
- */
-static int wake_wide(struct task_struct *p)
-{
- unsigned int master = current->wakee_flips;
- unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
-
- if (master < slave)
- swap(master, slave);
- if (slave < factor || master < slave * factor)
- return 0;
- return 1;
-}
-
/*
* The purpose of wake_affine() is to quickly determine on which CPU we can run
* soonest. For the purpose of speed we only consider the waking and previous
@@ -6402,238 +5745,6 @@
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
-/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
- */
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
-{
- struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /*
- * If @p migrates from @cpu to another, remove its contribution. Or,
- * if @p migrates from another CPU to @cpu, add its contribution. In
- * the other cases, @cpu is not impacted by the migration, so the
- * util_avg should already be correct.
- */
- if (task_cpu(p) == cpu && dst_cpu != cpu)
- sub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
- util += task_util(p);
-
- if (sched_feat(UTIL_EST)) {
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
- /*
- * During wake-up, the task isn't enqueued yet and doesn't
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- * so just add it (if needed) to "simulate" what will be
- * cpu_util() after the task has been enqueued.
- */
- if (dst_cpu == cpu)
- util_est += _task_util_est(p);
-
- util = max(util, util_est);
- }
-
- return min(util, capacity_orig_of(cpu));
-}
-
-/*
- * compute_energy(): Estimates the energy that @pd would consume if @p was
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
- * to compute what would be the energy if we decided to actually migrate that
- * task.
- */
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
-{
- struct cpumask *pd_mask = perf_domain_span(pd);
- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- unsigned long max_util = 0, sum_util = 0;
- int cpu;
-
- /*
- * The capacity state of CPUs of the current rd can be driven by CPUs
- * of another rd if they belong to the same pd. So, account for the
- * utilization of these CPUs too by masking pd with cpu_online_mask
- * instead of the rd span.
- *
- * If an entire pd is outside of the current rd, it will not appear in
- * its pd list and will not be accounted by compute_energy().
- */
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
- struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
-
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- ENERGY_UTIL, NULL);
-
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the
- * FREQUENCY_UTIL's utilization can be max OPP.
- */
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
- }
-
- return em_pd_energy(pd->em_pd, max_util, sum_util);
-}
-
-/*
- * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
- * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
- * spare capacity in each performance domain and uses it as a potential
- * candidate to execute the task. Then, it uses the Energy Model to figure
- * out which of the CPU candidates is the most energy-efficient.
- *
- * The rationale for this heuristic is as follows. In a performance domain,
- * all the most energy efficient CPU candidates (according to the Energy
- * Model) are those for which we'll request a low frequency. When there are
- * several CPUs for which the frequency request will be the same, we don't
- * have enough data to break the tie between them, because the Energy Model
- * only includes active power costs. With this model, if we assume that
- * frequency requests follow utilization (e.g. using schedutil), the CPU with
- * the maximum spare capacity in a performance domain is guaranteed to be among
- * the best candidates of the performance domain.
- *
- * In practice, it could be preferable from an energy standpoint to pack
- * small tasks on a CPU in order to let other CPUs go in deeper idle states,
- * but that could also hurt our chances to go cluster idle, and we have no
- * ways to tell with the current Energy Model if this is actually a good
- * idea or not. So, find_energy_efficient_cpu() basically favors
- * cluster-packing, and spreading inside a cluster. That should at least be
- * a good thing for latency, and this is consistent with the idea that most
- * of the energy savings of EAS come from the asymmetry of the system, and
- * not so much from breaking the tie between identical CPUs. That's also the
- * reason why EAS is enabled in the topology code only for systems where
- * SD_ASYM_CPUCAPACITY is set.
- *
- * NOTE: Forkees are not accepted in the energy-aware wake-up path because
- * they don't have any useful utilization data yet and it's not possible to
- * forecast their impact on energy consumption. Consequently, they will be
- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
- * to be energy-inefficient in some use-cases. The alternative would be to
- * bias new tasks towards specific types of CPUs first, or to try to infer
- * their util_avg from the parent task, but those heuristics could hurt
- * other use-cases too. So, until someone finds a better way to solve this,
- * let's keep things simple by re-using the existing slow path.
- */
-static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
-{
- unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- unsigned long cpu_cap, util, base_energy = 0;
- int cpu, best_energy_cpu = prev_cpu;
- struct sched_domain *sd;
- struct perf_domain *pd;
-
- rcu_read_lock();
- pd = rcu_dereference(rd->pd);
- if (!pd || READ_ONCE(rd->overutilized))
- goto fail;
-
- /*
- * Energy-aware wake-up happens on the lowest sched_domain starting
- * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
- */
- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
- while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
- sd = sd->parent;
- if (!sd)
- goto fail;
-
- sync_entity_load_avg(&p->se);
- if (!task_util_est(p))
- goto unlock;
-
- for (; pd; pd = pd->next) {
- unsigned long cur_delta, spare_cap, max_spare_cap = 0;
- unsigned long base_energy_pd;
- int max_spare_cap_cpu = -1;
-
- /* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
-
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
- continue;
-
- util = cpu_util_next(cpu, p, cpu);
- cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap - util;
-
- /*
- * Skip CPUs that cannot satisfy the capacity request.
- * IOW, placing the task there would make the CPU
- * overutilized. Take uclamp into account to see how
- * much capacity we can get out of the CPU; this is
- * aligned with schedutil_cpu_util().
- */
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- if (!fits_capacity(util, cpu_cap))
- continue;
-
- /* Always use prev_cpu as a candidate. */
- if (cpu == prev_cpu) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- prev_delta -= base_energy_pd;
- best_delta = min(best_delta, prev_delta);
- }
-
- /*
- * Find the CPU with the maximum spare capacity in
- * the performance domain
- */
- if (spare_cap > max_spare_cap) {
- max_spare_cap = spare_cap;
- max_spare_cap_cpu = cpu;
- }
- }
-
- /* Evaluate the energy impact of using this CPU. */
- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
- cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
- cur_delta -= base_energy_pd;
- if (cur_delta < best_delta) {
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- }
- }
- }
-unlock:
- rcu_read_unlock();
-
- /*
- * Pick the best CPU if prev_cpu cannot be used, or if it saves at
- * least 6% of the energy used by prev_cpu.
- */
- if (prev_delta == ULONG_MAX)
- return best_energy_cpu;
-
- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
- return best_energy_cpu;
-
- return prev_cpu;
-
-fail:
- rcu_read_unlock();
-
- return -1;
-}
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
@@ -6656,19 +5767,6 @@
int want_affine = 0;
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
- if (sd_flag & SD_BALANCE_WAKE) {
- record_wakee(p);
-
- if (sched_energy_enabled()) {
- new_cpu = find_energy_efficient_cpu(p, prev_cpu);
- if (new_cpu >= 0)
- return new_cpu;
- new_cpu = prev_cpu;
- }
-
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
- }
-
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
@@ -6696,7 +5794,9 @@
if (unlikely(sd)) {
/* Slow path */
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
- } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+ }
+
+ else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
@@ -6718,59 +5818,6 @@
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
- /*
- * As blocked tasks retain absolute vruntime the migration needs to
- * deal with this by subtracting the old and adding the new
- * min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
- if (p->state == TASK_WAKING) {
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
-
- se->vruntime -= min_vruntime;
- }
-
- if (p->on_rq == TASK_ON_RQ_MIGRATING) {
- /*
- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
- * rq->lock and can modify state directly.
- */
- lockdep_assert_held(&task_rq(p)->lock);
- detach_entity_cfs_rq(&p->se);
-
- } else {
- /*
- * We are supposed to update the task to "current" time, then
- * its up to date and ready to go to new CPU/cfs_rq. But we
- * have difficulty in getting what current time is, so simply
- * throw away the out-of-date time. This will result in the
- * wakee task is less decayed, but giving the wakee more load
- * sounds not bad.
- */
- remove_entity_load_avg(&p->se);
- }
-
- /* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
-
- /* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
-
- update_scan_period(p, new_cpu);
}
static void task_dead_fair(struct task_struct *p)
@@ -6781,32 +5828,10 @@
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- if (rq->nr_running)
return 1;
-
- return newidle_balance(rq, rf) != 0;
}
#endif /* CONFIG_SMP */
-static unsigned long wakeup_gran(struct sched_entity *se)
-{
- unsigned long gran = sysctl_sched_wakeup_granularity;
-
- /*
- * Since its curr running now, convert the gran from real-time
- * to virtual-time in his units.
- *
- * By using 'se' instead of 'curr' we penalize light tasks, so
- * they get preempted easier. That is, if 'se' < 'curr' then
- * the resulting gran will be larger, therefore penalizing the
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
- * be smaller, again penalizing the lighter task.
- *
- * This is especially important for buddies when the leftmost
- * task is higher priority than the buddy.
- */
- return calc_delta_fair(gran, se);
-}
/*
* Should 'se' preempt 'curr'.
@@ -6817,54 +5842,43 @@
* g
* |<--->|c
*
- * w(c, s1) = -1
+ * w(c, s1) = -1 // don't preempt
* w(c, s2) = 0
- * w(c, s3) = 1
+ * w(c, s3) = 1 // preempt
*
*/
static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
- s64 gran, vdiff = curr->vruntime - se->vruntime;
-
- if (vdiff <= 0)
- return -1;
-
- gran = wakeup_gran(se);
- if (vdiff > gran)
- return 1;
-
- return 0;
-}
-
-static void set_last_buddy(struct sched_entity *se)
+wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se) /* 1 if se outranks curr by the ratio below, else -1 */
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
+ u64 r_curr, r_se, w_curr, w_se;
+ struct task_struct *t_curr = task_of(curr); /* NOTE(review): assumes curr and se are task entities, not group entities -- confirm */
+ struct task_struct *t_se = task_of(se);
+ u64 vr_curr = curr->sum_exec_runtime + 1; /* +1 so the divisions below do not see 0 for a task that has not run yet */
+ u64 vr_se = se->sum_exec_runtime + 1;
+ s64 diff;
+
+ w_curr = (now - t_curr->start_boottime) - vr_curr; /* time since start_boottime minus (runtime + 1) */
+ w_se = (now - t_se->start_boottime) - vr_se; /* NOTE(review): unsigned math wraps if now < start_boottime + vr; visible callers pass rq_clock_task() -- confirm the clocks are comparable */
+
+ w_curr *= (140 - t_curr->prio); /* lower prio value => larger multiplier; 140 is presumably MAX_PRIO -- confirm */
+ w_se *= (140 - t_se->prio);
+
+ r_curr = w_curr / vr_curr; /* NOTE(review): raw u64 division; 32-bit kernels usually require div64_u64() -- confirm targets */
+ r_se = w_se / vr_se;
+ diff = (s64)(r_se) - (s64)(r_curr); /* positive when se has the larger quotient */
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- cfs_rq_of(se)->last = se;
+ if (diff == 0) /* quotients tie: fall back to comparing the remainders */
+ {
+ r_curr = w_curr % vr_curr;
+ r_se = w_se % vr_se;
+ diff = (s64)(r_se) - (s64)(r_curr);
}
-}
-static void set_next_buddy(struct sched_entity *se)
-{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
- for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
- return;
- cfs_rq_of(se)->next = se;
- }
-}
+ if (diff > 0)
+ return 1; /* se wins: visible callers preempt or select se only on a result of 1 */
-static void set_skip_buddy(struct sched_entity *se)
-{
- for_each_sched_entity(se)
- cfs_rq_of(se)->skip = se;
+ return -1; /* curr wins or full tie; this version never returns 0 */
}
/*
@@ -6874,28 +5888,12 @@
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- int scale = cfs_rq->nr_running >= sched_nr_latency;
- int next_buddy_marked = 0;
+ u64 now = rq_clock_task(rq);
if (unlikely(se == pse))
return;
/*
- * This is possible from callers such as attach_tasks(), in which we
- * unconditionally check_prempt_curr() after an enqueue (which may have
- * lead to a throttle). This both saves work and prevents false
- * next-buddy nomination below.
- */
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
- return;
-
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
- set_next_buddy(pse);
- next_buddy_marked = 1;
- }
-
- /*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
*
@@ -6923,13 +5921,7 @@
find_matching_se(&se, &pse);
update_curr(cfs_rq_of(se));
BUG_ON(!pse);
- if (wakeup_preempt_entity(se, pse) == 1) {
- /*
- * Bias pick_next to pick the sched entity that is
- * triggering this preemption.
- */
- if (!next_buddy_marked)
- set_next_buddy(pse);
+ if (wakeup_preempt_entity(now, se, pse) == 1) {
goto preempt;
}
@@ -6948,113 +5940,36 @@
*/
if (unlikely(!se->on_rq || curr == rq->idle))
return;
-
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
- set_last_buddy(se);
}
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
- struct sched_entity *se;
+ struct sched_entity *se, *next;
struct task_struct *p;
- int new_tasks;
+ u64 now = rq_clock_task(rq);
-again:
- if (!sched_fair_runnable(rq))
+ if (unlikely(!sched_fair_runnable(rq)))
goto idle;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (!prev || prev->sched_class != &fair_sched_class)
- goto simple;
-
- /*
- * Because of the set_next_buddy() in dequeue_task_fair() it is rather
- * likely that a next task is from the same cgroup as the current.
- *
- * Therefore attempt to avoid putting and setting the entire cgroup
- * hierarchy, only change the part that actually changes.
- */
+ se = next = cfs_rq->head;
+ next = next->next[DIR_RIGHT];
- do {
- struct sched_entity *curr = cfs_rq->curr;
-
- /*
- * Since we got here without doing put_prev_entity() we also
- * have to consider cfs_rq->curr. If it is still a runnable
- * entity, update_curr() will update its vruntime, otherwise
- * forget we've ever seen it.
- */
- if (curr) {
- if (curr->on_rq)
- update_curr(cfs_rq);
- else
- curr = NULL;
-
- /*
- * This call to check_cfs_rq_runtime() will do the
- * throttle and dequeue its entity in the parent(s).
- * Therefore the nr_running test will indeed
- * be correct.
- */
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
- cfs_rq = &rq->cfs;
-
- if (!cfs_rq->nr_running)
- goto idle;
-
- goto simple;
- }
- }
-
- se = pick_next_entity(cfs_rq, curr);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
-
- p = task_of(se);
-
- /*
- * Since we haven't yet done put_prev_entity and if the selected task
- * is a different task than we started out with, try and touch the
- * least amount of cfs_rqs.
- */
- if (prev != p) {
- struct sched_entity *pse = &prev->se;
-
- while (!(cfs_rq = is_same_group(se, pse))) {
- int se_depth = se->depth;
- int pse_depth = pse->depth;
-
- if (se_depth <= pse_depth) {
- put_prev_entity(cfs_rq_of(pse), pse);
- pse = parent_entity(pse);
- }
- if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se);
- se = parent_entity(se);
- }
- }
+ while (next)
+ {
+ if (wakeup_preempt_entity(now, se, next) == 1)
+ se = next;
- put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se);
+ next = next->next[DIR_RIGHT];
}
- goto done;
-simple:
-#endif
- if (prev)
- put_prev_task(rq, prev);
-
- do {
- se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
- cfs_rq = group_cfs_rq(se);
- } while (cfs_rq);
p = task_of(se);
-done: __maybe_unused;
+ se->quantom = 0;
+
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
@@ -7075,19 +5990,6 @@
if (!rf)
return NULL;
- new_tasks = newidle_balance(rq, rf);
-
- /*
- * Because newidle_balance() releases (and re-acquires) rq->lock, it is
- * possible for any higher priority task to appear. In that case we
- * must re-start the pick_next_entity() loop.
- */
- if (new_tasks < 0)
- return RETRY_TASK;
-
- if (new_tasks > 0)
- goto again;
-
/*
* rq is about to be idle, check if we need to update the
* lost_idle_time of clock_pelt
@@ -7125,7 +6027,6 @@
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- struct sched_entity *se = &curr->se;
/*
* Are we the only task in the tree?
@@ -7133,8 +6034,6 @@
if (unlikely(rq->nr_running == 1))
return;
- clear_buddies(cfs_rq, se);
-
if (curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
@@ -7148,8 +6047,6 @@
*/
rq_clock_skip_update(rq);
}
-
- set_skip_buddy(se);
}
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
@@ -7160,9 +6057,6 @@
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
- /* Tell the scheduler that we'd really like pse to run next. */
- set_next_buddy(se);
-
yield_task_fair(rq);
return true;
@@ -7370,39 +6264,6 @@
struct list_head tasks;
};
-/*
- * Is this task likely cache-hot:
- */
-static int task_hot(struct task_struct *p, struct lb_env *env)
-{
- s64 delta;
-
- lockdep_assert_held(&env->src_rq->lock);
-
- if (p->sched_class != &fair_sched_class)
- return 0;
-
- if (unlikely(task_has_idle_policy(p)))
- return 0;
-
- /*
- * Buddy candidates are cache hot:
- */
- if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
- return 1;
-
- if (sysctl_sched_migration_cost == -1)
- return 1;
- if (sysctl_sched_migration_cost == 0)
- return 0;
-
- delta = rq_clock_task(env->src_rq) - p->se.exec_start;
-
- return delta < (s64)sysctl_sched_migration_cost;
-}
-
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns 1, if task migration degrades locality
@@ -7463,302 +6324,10 @@
}
#endif
-/*
- * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
- */
-static
-int can_migrate_task(struct task_struct *p, struct lb_env *env)
-{
- int tsk_cache_hot;
-
- lockdep_assert_held(&env->src_rq->lock);
-
- /*
- * We do not migrate tasks that are:
- * 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU.
- */
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
- return 0;
-
- if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
- int cpu;
-
- schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
-
- env->flags |= LBF_SOME_PINNED;
-
- /*
- * Remember if this task can be migrated to any other CPU in
- * our sched_group. We may want to revisit it if we couldn't
- * meet load balance goals by pulling other tasks on src_cpu.
- *
- * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
- * already computed one in current iteration.
- */
- if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
- return 0;
-
- /* Prevent to re-select dst_cpu via env's CPUs: */
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
- env->flags |= LBF_DST_PINNED;
- env->new_dst_cpu = cpu;
- break;
- }
- }
-
- return 0;
- }
-
- /* Record that we found atleast one task that could run on dst_cpu */
- env->flags &= ~LBF_ALL_PINNED;
-
- if (task_running(env->src_rq, p)) {
- schedstat_inc(p->se.statistics.nr_failed_migrations_running);
- return 0;
- }
-
- /*
- * Aggressive migration if:
- * 1) destination numa is preferred
- * 2) task is cache cold, or
- * 3) too many balance attempts have failed.
- */
- tsk_cache_hot = migrate_degrades_locality(p, env);
- if (tsk_cache_hot == -1)
- tsk_cache_hot = task_hot(p, env);
-
- if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->se.statistics.nr_forced_migrations);
- }
- return 1;
- }
-
- schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
- return 0;
-}
-
-/*
- * detach_task() -- detach the task for the migration specified in env
- */
-static void detach_task(struct task_struct *p, struct lb_env *env)
-{
- lockdep_assert_held(&env->src_rq->lock);
-
- deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
- set_task_cpu(p, env->dst_cpu);
-}
-/*
- * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
- * part of active balancing operations within "domain".
- *
- * Returns a task if successful and NULL otherwise.
- */
-static struct task_struct *detach_one_task(struct lb_env *env)
-{
- struct task_struct *p;
-
- lockdep_assert_held(&env->src_rq->lock);
-
- list_for_each_entry_reverse(p,
- &env->src_rq->cfs_tasks, se.group_node) {
- if (!can_migrate_task(p, env))
- continue;
-
- detach_task(p, env);
-
- /*
- * Right now, this is only the second place where
- * lb_gained[env->idle] is updated (other is detach_tasks)
- * so we can safely collect stats here rather than
- * inside detach_tasks().
- */
- schedstat_inc(env->sd->lb_gained[env->idle]);
- return p;
- }
- return NULL;
-}
static const unsigned int sched_nr_migrate_break = 32;
-/*
- * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
- * busiest_rq, as part of a balancing operation within domain "sd".
- *
- * Returns number of detached tasks if successful and 0 otherwise.
- */
-static int detach_tasks(struct lb_env *env)
-{
- struct list_head *tasks = &env->src_rq->cfs_tasks;
- unsigned long util, load;
- struct task_struct *p;
- int detached = 0;
-
- lockdep_assert_held(&env->src_rq->lock);
-
- if (env->imbalance <= 0)
- return 0;
-
- while (!list_empty(tasks)) {
- /*
- * We don't want to steal all, otherwise we may be treated likewise,
- * which could at worst lead to a livelock crash.
- */
- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
- break;
-
- p = list_last_entry(tasks, struct task_struct, se.group_node);
-
- env->loop++;
- /* We've more or less seen every task there is, call it quits */
- if (env->loop > env->loop_max)
- break;
-
- /* take a breather every nr_migrate tasks */
- if (env->loop > env->loop_break) {
- env->loop_break += sched_nr_migrate_break;
- env->flags |= LBF_NEED_BREAK;
- break;
- }
-
- if (!can_migrate_task(p, env))
- goto next;
-
- switch (env->migration_type) {
- case migrate_load:
- load = task_h_load(p);
-
- if (sched_feat(LB_MIN) &&
- load < 16 && !env->sd->nr_balance_failed)
- goto next;
-
- /*
- * Make sure that we don't migrate too much load.
- * Nevertheless, let relax the constraint if
- * scheduler fails to find a good waiting task to
- * migrate.
- */
- if (load/2 > env->imbalance &&
- env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
- goto next;
-
- env->imbalance -= load;
- break;
-
- case migrate_util:
- util = task_util_est(p);
-
- if (util > env->imbalance)
- goto next;
-
- env->imbalance -= util;
- break;
-
- case migrate_task:
- env->imbalance--;
- break;
-
- case migrate_misfit:
- /* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
- goto next;
-
- env->imbalance = 0;
- break;
- }
-
- detach_task(p, env);
- list_add(&p->se.group_node, &env->tasks);
-
- detached++;
-
-#ifdef CONFIG_PREEMPTION
- /*
- * NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is detached to minimize
- * the critical section.
- */
- if (env->idle == CPU_NEWLY_IDLE)
- break;
-#endif
-
- /*
- * We only want to steal up to the prescribed amount of
- * load/util/tasks.
- */
- if (env->imbalance <= 0)
- break;
-
- continue;
-next:
- list_move(&p->se.group_node, tasks);
- }
-
- /*
- * Right now, this is one of only two places we collect this stat
- * so we can safely collect detach_one_task() stats here rather
- * than inside detach_one_task().
- */
- schedstat_add(env->sd->lb_gained[env->idle], detached);
-
- return detached;
-}
-
-/*
- * attach_task() -- attach the task detached by detach_task() to its new rq.
- */
-static void attach_task(struct rq *rq, struct task_struct *p)
-{
- lockdep_assert_held(&rq->lock);
-
- BUG_ON(task_rq(p) != rq);
- activate_task(rq, p, ENQUEUE_NOCLOCK);
- check_preempt_curr(rq, p, 0);
-}
-
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
- struct rq_flags rf;
-
- rq_lock(rq, &rf);
- update_rq_clock(rq);
- attach_task(rq, p);
- rq_unlock(rq, &rf);
-}
-
-/*
- * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
- * new rq.
- */
-static void attach_tasks(struct lb_env *env)
-{
- struct list_head *tasks = &env->tasks;
- struct task_struct *p;
- struct rq_flags rf;
-
- rq_lock(env->dst_rq, &rf);
- update_rq_clock(env->dst_rq);
-
- while (!list_empty(tasks)) {
- p = list_first_entry(tasks, struct task_struct, se.group_node);
- list_del_init(&p->se.group_node);
-
- attach_task(env->dst_rq, p);
- }
-
- rq_unlock(env->dst_rq, &rf);
-}
#ifdef CONFIG_NO_HZ_COMMON
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
@@ -9086,293 +7655,6 @@
) / SCHED_CAPACITY_SCALE;
}
-/******* find_busiest_group() helpers end here *********************/
-
-/*
- * Decision matrix according to the local and busiest group type:
- *
- * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
- * has_spare nr_idle balanced N/A N/A balanced balanced
- * fully_busy nr_idle nr_idle N/A N/A balanced balanced
- * misfit_task force N/A N/A N/A force force
- * asym_packing force force N/A N/A force force
- * imbalanced force force N/A N/A force force
- * overloaded force force N/A N/A force avg_load
- *
- * N/A : Not Applicable because already filtered while updating
- * statistics.
- * balanced : The system is balanced for these 2 groups.
- * force : Calculate the imbalance as load migration is probably needed.
- * avg_load : Only if imbalance is significant enough.
- * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
- * different in groups.
- */
-
-/**
- * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance.
- *
- * Also calculates the amount of runnable load which should be moved
- * to restore balance.
- *
- * @env: The load balancing environment.
- *
- * Return: - The busiest group if imbalance exists.
- */
-static struct sched_group *find_busiest_group(struct lb_env *env)
-{
- struct sg_lb_stats *local, *busiest;
- struct sd_lb_stats sds;
-
- init_sd_lb_stats(&sds);
-
- /*
- * Compute the various statistics relevant for load balancing at
- * this level.
- */
- update_sd_lb_stats(env, &sds);
-
- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
-
- local = &sds.local_stat;
- busiest = &sds.busiest_stat;
-
- /* There is no busy sibling group to pull tasks from */
- if (!sds.busiest)
- goto out_balanced;
-
- /* Misfit tasks should be dealt with regardless of the avg load */
- if (busiest->group_type == group_misfit_task)
- goto force_balance;
-
- /* ASYM feature bypasses nice load balance check */
- if (busiest->group_type == group_asym_packing)
- goto force_balance;
-
- /*
- * If the busiest group is imbalanced the below checks don't
- * work because they assume all things are equal, which typically
- * isn't true due to cpus_ptr constraints and the like.
- */
- if (busiest->group_type == group_imbalanced)
- goto force_balance;
-
- /*
- * If the local group is busier than the selected busiest group
- * don't try and pull any tasks.
- */
- if (local->group_type > busiest->group_type)
- goto out_balanced;
-
- /*
- * When groups are overloaded, use the avg_load to ensure fairness
- * between tasks.
- */
- if (local->group_type == group_overloaded) {
- /*
- * If the local group is more loaded than the selected
- * busiest group don't try to pull any tasks.
- */
- if (local->avg_load >= busiest->avg_load)
- goto out_balanced;
-
- /* XXX broken for overlapping NUMA groups */
- sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
- sds.total_capacity;
-
- /*
- * Don't pull any tasks if this group is already above the
- * domain average load.
- */
- if (local->avg_load >= sds.avg_load)
- goto out_balanced;
-
- /*
- * If the busiest group is more loaded, use imbalance_pct to be
- * conservative.
- */
- if (100 * busiest->avg_load <=
- env->sd->imbalance_pct * local->avg_load)
- goto out_balanced;
- }
-
- /* Try to move all excess tasks to child's sibling domain */
- if (sds.prefer_sibling && local->group_type == group_has_spare &&
- busiest->sum_nr_running > local->sum_nr_running + 1)
- goto force_balance;
-
- if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE)
- /*
- * If the busiest group is not overloaded (and as a
- * result the local one too) but this CPU is already
- * busy, let another idle CPU try to pull task.
- */
- goto out_balanced;
-
- if (busiest->group_weight > 1 &&
- local->idle_cpus <= (busiest->idle_cpus + 1))
- /*
- * If the busiest group is not overloaded
- * and there is no imbalance between this and busiest
- * group wrt idle CPUs, it is balanced. The imbalance
- * becomes significant if the diff is greater than 1
- * otherwise we might end up to just move the imbalance
- * on another group. Of course this applies only if
- * there is more than 1 CPU per group.
- */
- goto out_balanced;
-
- if (busiest->sum_h_nr_running == 1)
- /*
- * busiest doesn't have any tasks waiting to run
- */
- goto out_balanced;
- }
-
-force_balance:
- /* Looks like there is an imbalance. Compute it */
- calculate_imbalance(env, &sds);
- return env->imbalance ? sds.busiest : NULL;
-
-out_balanced:
- env->imbalance = 0;
- return NULL;
-}
-
-/*
- * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
- */
-static struct rq *find_busiest_queue(struct lb_env *env,
- struct sched_group *group)
-{
- struct rq *busiest = NULL, *rq;
- unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
- unsigned int busiest_nr = 0;
- int i;
-
- for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, load, util;
- unsigned int nr_running;
- enum fbq_type rt;
-
- rq = cpu_rq(i);
- rt = fbq_classify_rq(rq);
-
- /*
- * We classify groups/runqueues into three groups:
- * - regular: there are !numa tasks
- * - remote: there are numa tasks that run on the 'wrong' node
- * - all: there is no distinction
- *
- * In order to avoid migrating ideally placed numa tasks,
- * ignore those when there's better options.
- *
- * If we ignore the actual busiest queue to migrate another
- * task, the next balance pass can still reduce the busiest
- * queue by moving tasks around inside the node.
- *
- * If we cannot move enough load due to this classification
- * the next pass will adjust the group classification and
- * allow migration of more tasks.
- *
- * Both cases only affect the total convergence complexity.
- */
- if (rt > env->fbq_type)
- continue;
-
- capacity = capacity_of(i);
- nr_running = rq->cfs.h_nr_running;
-
- /*
- * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
- * eventually lead to active_balancing high->low capacity.
- * Higher per-CPU capacity is considered better than balancing
- * average load.
- */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- capacity_of(env->dst_cpu) < capacity &&
- nr_running == 1)
- continue;
-
- switch (env->migration_type) {
- case migrate_load:
- /*
- * When comparing with load imbalance, use cpu_load()
- * which is not scaled with the CPU capacity.
- */
- load = cpu_load(rq);
-
- if (nr_running == 1 && load > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
- break;
-
- /*
- * For the load comparisons with the other CPUs,
- * consider the cpu_load() scaled with the CPU
- * capacity, so that the load can be moved away
- * from the CPU that is potentially running at a
- * lower capacity.
- *
- * Thus we're looking for max(load_i / capacity_i),
- * crosswise multiplication to rid ourselves of the
- * division works out to:
- * load_i * capacity_j > load_j * capacity_i;
- * where j is our previous maximum.
- */
- if (load * busiest_capacity > busiest_load * capacity) {
- busiest_load = load;
- busiest_capacity = capacity;
- busiest = rq;
- }
- break;
-
- case migrate_util:
- util = cpu_util(cpu_of(rq));
-
- /*
- * Don't try to pull utilization from a CPU with one
- * running task. Whatever its utilization, we will fail
- * detach the task.
- */
- if (nr_running <= 1)
- continue;
-
- if (busiest_util < util) {
- busiest_util = util;
- busiest = rq;
- }
- break;
-
- case migrate_task:
- if (busiest_nr < nr_running) {
- busiest_nr = nr_running;
- busiest = rq;
- }
- break;
-
- case migrate_misfit:
- /*
- * For ASYM_CPUCAPACITY domains with misfit tasks we
- * simply seek the "biggest" misfit task.
- */
- if (rq->misfit_task_load > busiest_load) {
- busiest_load = rq->misfit_task_load;
- busiest = rq;
- }
-
- break;
-
- }
- }
-
- return busiest;
-}
/*
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
@@ -9419,334 +7701,6 @@
return 0;
}
-static int need_active_balance(struct lb_env *env)
-{
- struct sched_domain *sd = env->sd;
-
- if (voluntary_active_balance(env))
- return 1;
-
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
-}
-
-static int active_load_balance_cpu_stop(void *data);
-
-static int should_we_balance(struct lb_env *env)
-{
- struct sched_group *sg = env->sd->groups;
- int cpu, balance_cpu = -1;
-
- /*
- * Ensure the balancing environment is consistent; can happen
- * when the softirq triggers 'during' hotplug.
- */
- if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
- return 0;
-
- /*
- * In the newly idle case, we will allow all the CPUs
- * to do the newly idle load balance.
- */
- if (env->idle == CPU_NEWLY_IDLE)
- return 1;
-
- /* Try to find first idle CPU */
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
- if (!idle_cpu(cpu))
- continue;
-
- balance_cpu = cpu;
- break;
- }
-
- if (balance_cpu == -1)
- balance_cpu = group_balance_cpu(sg);
-
- /*
- * First idle CPU or the first CPU(busiest) in this sched group
- * is eligible for doing load balancing at this and above domains.
- */
- return balance_cpu == env->dst_cpu;
-}
-
-/*
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
- * tasks if there is an imbalance.
- */
-static int load_balance(int this_cpu, struct rq *this_rq,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *continue_balancing)
-{
- int ld_moved, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
- struct sched_group *group;
- struct rq *busiest;
- struct rq_flags rf;
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
-
- struct lb_env env = {
- .sd = sd,
- .dst_cpu = this_cpu,
- .dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
- .idle = idle,
- .loop_break = sched_nr_migrate_break,
- .cpus = cpus,
- .fbq_type = all,
- .tasks = LIST_HEAD_INIT(env.tasks),
- };
-
- cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
-
- schedstat_inc(sd->lb_count[idle]);
-
-redo:
- if (!should_we_balance(&env)) {
- *continue_balancing = 0;
- goto out_balanced;
- }
-
- group = find_busiest_group(&env);
- if (!group) {
- schedstat_inc(sd->lb_nobusyg[idle]);
- goto out_balanced;
- }
-
- busiest = find_busiest_queue(&env, group);
- if (!busiest) {
- schedstat_inc(sd->lb_nobusyq[idle]);
- goto out_balanced;
- }
-
- BUG_ON(busiest == env.dst_rq);
-
- schedstat_add(sd->lb_imbalance[idle], env.imbalance);
-
- env.src_cpu = busiest->cpu;
- env.src_rq = busiest;
-
- ld_moved = 0;
- if (busiest->nr_running > 1) {
- /*
- * Attempt to move tasks. If find_busiest_group has found
- * an imbalance but busiest->nr_running <= 1, the group is
- * still unbalanced. ld_moved simply stays zero, so it is
- * correctly treated as an imbalance.
- */
- env.flags |= LBF_ALL_PINNED;
- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
-
-more_balance:
- rq_lock_irqsave(busiest, &rf);
- update_rq_clock(busiest);
-
- /*
- * cur_ld_moved - load moved in current iteration
- * ld_moved - cumulative load moved across iterations
- */
- cur_ld_moved = detach_tasks(&env);
-
- /*
- * We've detached some tasks from busiest_rq. Every
- * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
- * unlock busiest->lock, and we are able to be sure
- * that nobody can manipulate the tasks in parallel.
- * See task_rq_lock() family for the details.
- */
-
- rq_unlock(busiest, &rf);
-
- if (cur_ld_moved) {
- attach_tasks(&env);
- ld_moved += cur_ld_moved;
- }
-
- local_irq_restore(rf.flags);
-
- if (env.flags & LBF_NEED_BREAK) {
- env.flags &= ~LBF_NEED_BREAK;
- goto more_balance;
- }
-
- /*
- * Revisit (affine) tasks on src_cpu that couldn't be moved to
- * us and move them to an alternate dst_cpu in our sched_group
- * where they can run. The upper limit on how many times we
- * iterate on same src_cpu is dependent on number of CPUs in our
- * sched_group.
- *
- * This changes load balance semantics a bit on who can move
- * load to a given_cpu. In addition to the given_cpu itself
- * (or a ilb_cpu acting on its behalf where given_cpu is
- * nohz-idle), we now have balance_cpu in a position to move
- * load to given_cpu. In rare situations, this may cause
- * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
- * _independently_ and at _same_ time to move some load to
- * given_cpu) causing exceess load to be moved to given_cpu.
- * This however should not happen so much in practice and
- * moreover subsequent load balance cycles should correct the
- * excess load moved.
- */
- if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
-
- /* Prevent to re-select dst_cpu via env's CPUs */
- __cpumask_clear_cpu(env.dst_cpu, env.cpus);
-
- env.dst_rq = cpu_rq(env.new_dst_cpu);
- env.dst_cpu = env.new_dst_cpu;
- env.flags &= ~LBF_DST_PINNED;
- env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
-
- /*
- * Go back to "more_balance" rather than "redo" since we
- * need to continue with same src_cpu.
- */
- goto more_balance;
- }
-
- /*
- * We failed to reach balance because of affinity.
- */
- if (sd_parent) {
- int *group_imbalance = &sd_parent->groups->sgc->imbalance;
-
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
- *group_imbalance = 1;
- }
-
- /* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(env.flags & LBF_ALL_PINNED)) {
- __cpumask_clear_cpu(cpu_of(busiest), cpus);
- /*
- * Attempting to continue load balancing at the current
- * sched_domain level only makes sense if there are
- * active CPUs remaining as possible busiest CPUs to
- * pull load from which are not contained within the
- * destination group that is receiving any migrated
- * load.
- */
- if (!cpumask_subset(cpus, env.dst_grpmask)) {
- env.loop = 0;
- env.loop_break = sched_nr_migrate_break;
- goto redo;
- }
- goto out_all_pinned;
- }
- }
-
- if (!ld_moved) {
- schedstat_inc(sd->lb_failed[idle]);
- /*
- * Increment the failure counter only on periodic balance.
- * We do not want newidle balance, which can be very
- * frequent, pollute the failure counter causing
- * excessive cache_hot migrations and active balances.
- */
- if (idle != CPU_NEWLY_IDLE)
- sd->nr_balance_failed++;
-
- if (need_active_balance(&env)) {
- unsigned long flags;
-
- raw_spin_lock_irqsave(&busiest->lock, flags);
-
- /*
- * Don't kick the active_load_balance_cpu_stop,
- * if the curr task on busiest CPU can't be
- * moved to this_cpu:
- */
- if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
- raw_spin_unlock_irqrestore(&busiest->lock,
- flags);
- env.flags |= LBF_ALL_PINNED;
- goto out_one_pinned;
- }
-
- /*
- * ->active_balance synchronizes accesses to
- * ->active_balance_work. Once set, it's cleared
- * only after active load balance is finished.
- */
- if (!busiest->active_balance) {
- busiest->active_balance = 1;
- busiest->push_cpu = this_cpu;
- active_balance = 1;
- }
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
-
- if (active_balance) {
- stop_one_cpu_nowait(cpu_of(busiest),
- active_load_balance_cpu_stop, busiest,
- &busiest->active_balance_work);
- }
-
- /* We've kicked active balancing, force task migration. */
- sd->nr_balance_failed = sd->cache_nice_tries+1;
- }
- } else
- sd->nr_balance_failed = 0;
-
- if (likely(!active_balance) || voluntary_active_balance(&env)) {
- /* We were unbalanced, so reset the balancing interval */
- sd->balance_interval = sd->min_interval;
- } else {
- /*
- * If we've begun active balancing, start to back off. This
- * case may not be covered by the all_pinned logic if there
- * is only 1 task on the busy runqueue (because we don't call
- * detach_tasks).
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
- }
-
- goto out;
-
-out_balanced:
- /*
- * We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag only if other tasks got
- * a chance to move and fix the imbalance.
- */
- if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
- int *group_imbalance = &sd_parent->groups->sgc->imbalance;
-
- if (*group_imbalance)
- *group_imbalance = 0;
- }
-
-out_all_pinned:
- /*
- * We reach balance because all tasks are pinned at this level so
- * we can't migrate them. Let the imbalance flag set so parent level
- * can try to migrate them.
- */
- schedstat_inc(sd->lb_balanced[idle]);
-
- sd->nr_balance_failed = 0;
-
-out_one_pinned:
- ld_moved = 0;
-
- /*
- * newidle_balance() disregards balance intervals, so we could
- * repeatedly reach this code, which would lead to balance_interval
- * skyrocketting in a short amount of time. Skip the balance_interval
- * increase logic to avoid that.
- */
- if (env.idle == CPU_NEWLY_IDLE)
- goto out;
-
- /* tune up the balancing interval */
- if ((env.flags & LBF_ALL_PINNED &&
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
- sd->balance_interval < sd->max_interval)
- sd->balance_interval *= 2;
-out:
- return ld_moved;
-}
-
static inline unsigned long
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
{
@@ -9776,99 +7730,6 @@
}
/*
- * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
- * running tasks off the busiest CPU onto idle CPUs. It requires at
- * least 1 task to be running on each physical CPU where possible, and
- * avoids physical / logical imbalances.
- */
-static int active_load_balance_cpu_stop(void *data)
-{
- struct rq *busiest_rq = data;
- int busiest_cpu = cpu_of(busiest_rq);
- int target_cpu = busiest_rq->push_cpu;
- struct rq *target_rq = cpu_rq(target_cpu);
- struct sched_domain *sd;
- struct task_struct *p = NULL;
- struct rq_flags rf;
-
- rq_lock_irq(busiest_rq, &rf);
- /*
- * Between queueing the stop-work and running it is a hole in which
- * CPUs can become inactive. We should not move tasks from or to
- * inactive CPUs.
- */
- if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
- goto out_unlock;
-
- /* Make sure the requested CPU hasn't gone down in the meantime: */
- if (unlikely(busiest_cpu != smp_processor_id() ||
- !busiest_rq->active_balance))
- goto out_unlock;
-
- /* Is there any task to move? */
- if (busiest_rq->nr_running <= 1)
- goto out_unlock;
-
- /*
- * This condition is "impossible", if it occurs
- * we need to fix it. Originally reported by
- * Bjorn Helgaas on a 128-CPU setup.
- */
- BUG_ON(busiest_rq == target_rq);
-
- /* Search for an sd spanning us and the target CPU. */
- rcu_read_lock();
- for_each_domain(target_cpu, sd) {
- if ((sd->flags & SD_LOAD_BALANCE) &&
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
- break;
- }
-
- if (likely(sd)) {
- struct lb_env env = {
- .sd = sd,
- .dst_cpu = target_cpu,
- .dst_rq = target_rq,
- .src_cpu = busiest_rq->cpu,
- .src_rq = busiest_rq,
- .idle = CPU_IDLE,
- /*
- * can_migrate_task() doesn't need to compute new_dst_cpu
- * for active balancing. Since we have CPU_IDLE, but no
- * @dst_grpmask we need to make that test go away with lying
- * about DST_PINNED.
- */
- .flags = LBF_DST_PINNED,
- };
-
- schedstat_inc(sd->alb_count);
- update_rq_clock(busiest_rq);
-
- p = detach_one_task(&env);
- if (p) {
- schedstat_inc(sd->alb_pushed);
- /* Active balancing done, reset the failure counter. */
- sd->nr_balance_failed = 0;
- } else {
- schedstat_inc(sd->alb_failed);
- }
- }
- rcu_read_unlock();
-out_unlock:
- busiest_rq->active_balance = 0;
- rq_unlock(busiest_rq, &rf);
-
- if (p)
- attach_one_task(target_rq, p);
-
- local_irq_enable();
-
- return 0;
-}
-
-static DEFINE_SPINLOCK(balancing);
-
-/*
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
@@ -9877,114 +7738,6 @@
max_load_balance_interval = HZ*num_online_cpus()/10;
}
-/*
- * It checks each scheduling domain to see if it is due to be balanced,
- * and initiates a balancing operation if so.
- *
- * Balancing parameters are set up in init_sched_domains.
- */
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
-{
- int continue_balancing = 1;
- int cpu = rq->cpu;
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
- unsigned long interval;
- struct sched_domain *sd;
- /* Earliest time when we have to do rebalance again */
- unsigned long next_balance = jiffies + 60*HZ;
- int update_next_balance = 0;
- int need_serialize, need_decay = 0;
- u64 max_cost = 0;
-
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- /*
- * Decay the newidle max times here because this is a regular
- * visit to all the domains. Decay ~1% per second.
- */
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
- sd->max_newidle_lb_cost =
- (sd->max_newidle_lb_cost * 253) / 256;
- sd->next_decay_max_lb_cost = jiffies + HZ;
- need_decay = 1;
- }
- max_cost += sd->max_newidle_lb_cost;
-
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
- /*
- * Stop the load balance at this level. There is another
- * CPU in our sched group which is doing load balancing more
- * actively.
- */
- if (!continue_balancing) {
- if (need_decay)
- continue;
- break;
- }
-
- interval = get_sd_balance_interval(sd, busy);
-
- need_serialize = sd->flags & SD_SERIALIZE;
- if (need_serialize) {
- if (!spin_trylock(&balancing))
- goto out;
- }
-
- if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
- /*
- * The LBF_DST_PINNED logic could have changed
- * env->dst_cpu, so we can't know our idle
- * state even if we migrated tasks. Update it.
- */
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
- }
- sd->last_balance = jiffies;
- interval = get_sd_balance_interval(sd, busy);
- }
- if (need_serialize)
- spin_unlock(&balancing);
-out:
- if (time_after(next_balance, sd->last_balance + interval)) {
- next_balance = sd->last_balance + interval;
- update_next_balance = 1;
- }
- }
- if (need_decay) {
- /*
- * Ensure the rq-wide value also decays but keep it at a
- * reasonable floor to avoid funnies with rq->avg_idle.
- */
- rq->max_idle_balance_cost =
- max((u64)sysctl_sched_migration_cost, max_cost);
- }
- rcu_read_unlock();
-
- /*
- * next_balance will be updated only when there is a need.
- * When the cpu is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance)) {
- rq->next_balance = next_balance;
-
-#ifdef CONFIG_NO_HZ_COMMON
- /*
- * If this CPU has been elected to perform the nohz idle
- * balance. Other idle CPUs have already rebalanced with
- * nohz_idle_balance() and nohz.next_balance has been
- * updated accordingly. This CPU is now running the idle load
- * balance for itself and we need to update the
- * nohz.next_balance accordingly.
- */
- if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
- nohz.next_balance = rq->next_balance;
-#endif
- }
-}
static inline int on_null_domain(struct rq *rq)
{
@@ -10014,420 +7767,12 @@
return nr_cpu_ids;
}
-/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
- */
-static void kick_ilb(unsigned int flags)
-{
- int ilb_cpu;
-
- nohz.next_balance++;
-
- ilb_cpu = find_new_ilb();
-
- if (ilb_cpu >= nr_cpu_ids)
- return;
-
- flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
- if (flags & NOHZ_KICK_MASK)
- return;
-
- /*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target CPU which
- * is idle. And the softirq performing nohz idle load balance
- * will be run before returning from the IPI.
- */
- smp_send_reschedule(ilb_cpu);
-}
-
-/*
- * Current decision point for kicking the idle load balancer in the presence
- * of idle CPUs in the system.
- */
-static void nohz_balancer_kick(struct rq *rq)
-{
- unsigned long now = jiffies;
- struct sched_domain_shared *sds;
- struct sched_domain *sd;
- int nr_busy, i, cpu = rq->cpu;
- unsigned int flags = 0;
-
- if (unlikely(rq->idle_balance))
- return;
-
- /*
- * We may be recently in ticked or tickless idle mode. At the first
- * busy tick after returning from idle, we will update the busy stats.
- */
- nohz_balance_exit_idle(rq);
-
- /*
- * None are in tickless mode and hence no need for NOHZ idle load
- * balancing.
- */
- if (likely(!atomic_read(&nohz.nr_cpus)))
- return;
-
- if (READ_ONCE(nohz.has_blocked) &&
- time_after(now, READ_ONCE(nohz.next_blocked)))
- flags = NOHZ_STATS_KICK;
-
- if (time_before(now, nohz.next_balance))
- goto out;
-
- if (rq->nr_running >= 2) {
- flags = NOHZ_KICK_MASK;
- goto out;
- }
-
- rcu_read_lock();
-
- sd = rcu_dereference(rq->sd);
- if (sd) {
- /*
- * If there's a CFS task and the current CPU has reduced
- * capacity; kick the ILB to see if there's a better CPU to run
- * on.
- */
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
- flags = NOHZ_KICK_MASK;
- goto unlock;
- }
- }
-
- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
- if (sd) {
- /*
- * When ASYM_PACKING; see if there's a more preferred CPU
- * currently idle; in which case, kick the ILB to move tasks
- * around.
- */
- for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
- flags = NOHZ_KICK_MASK;
- goto unlock;
- }
- }
- }
-
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
- if (sd) {
- /*
- * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
- * to run the misfit task on.
- */
- if (check_misfit_status(rq, sd)) {
- flags = NOHZ_KICK_MASK;
- goto unlock;
- }
-
- /*
- * For asymmetric systems, we do not want to nicely balance
- * cache use, instead we want to embrace asymmetry and only
- * ensure tasks have enough CPU capacity.
- *
- * Skip the LLC logic because it's not relevant in that case.
- */
- goto unlock;
- }
-
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds) {
- /*
- * If there is an imbalance between LLC domains (IOW we could
- * increase the overall cache use), we need some less-loaded LLC
- * domain to pull some load. Likewise, we may need to spread
- * load within the current LLC domain (e.g. packed SMT cores but
- * other CPUs are idle). We can't really know from here how busy
- * the others are - so just get a nohz balance going if it looks
- * like this LLC domain has tasks we could move.
- */
- nr_busy = atomic_read(&sds->nr_busy_cpus);
- if (nr_busy > 1) {
- flags = NOHZ_KICK_MASK;
- goto unlock;
- }
- }
-unlock:
- rcu_read_unlock();
-out:
- if (flags)
- kick_ilb(flags);
-}
-
-static void set_cpu_sd_state_busy(int cpu)
-{
- struct sched_domain *sd;
-
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
- if (!sd || !sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 0;
-
- atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
-}
-
void nohz_balance_exit_idle(struct rq *rq)
{
- SCHED_WARN_ON(rq != this_rq());
-
- if (likely(!rq->nohz_tick_stopped))
- return;
-
- rq->nohz_tick_stopped = 0;
- cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
-
- set_cpu_sd_state_busy(rq->cpu);
}
-static void set_cpu_sd_state_idle(int cpu)
-{
- struct sched_domain *sd;
-
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
- if (!sd || sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 1;
-
- atomic_dec(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
-}
-
-/*
- * This routine will record that the CPU is going idle with tick stopped.
- * This info will be used in performing idle load balancing in the future.
- */
void nohz_balance_enter_idle(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- SCHED_WARN_ON(cpu != smp_processor_id());
-
- /* If this CPU is going down, then nothing needs to be done: */
- if (!cpu_active(cpu))
- return;
-
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
- return;
-
- /*
- * Can be set safely without rq->lock held
- * If a clear happens, it will have evaluated last additions because
- * rq->lock is held during the check and the clear
- */
- rq->has_blocked_load = 1;
-
- /*
- * The tick is still stopped but load could have been added in the
- * meantime. We set the nohz.has_blocked flag to trig a check of the
- * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
- * of nohz.has_blocked can only happen after checking the new load
- */
- if (rq->nohz_tick_stopped)
- goto out;
-
- /* If we're a completely isolated CPU, we don't play: */
- if (on_null_domain(rq))
- return;
-
- rq->nohz_tick_stopped = 1;
-
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
-
- /*
- * Ensures that if nohz_idle_balance() fails to observe our
- * @idle_cpus_mask store, it must observe the @has_blocked
- * store.
- */
- smp_mb__after_atomic();
-
- set_cpu_sd_state_idle(cpu);
-
-out:
- /*
- * Each time a cpu enter idle, we assume that it has blocked load and
- * enable the periodic update of the load of idle cpus
- */
- WRITE_ONCE(nohz.has_blocked, 1);
-}
-
-/*
- * Internal function that runs load balance for all idle cpus. The load balance
- * can be a simple update of blocked load or a complete load balance with
- * tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
- */
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
- enum cpu_idle_type idle)
-{
- /* Earliest time when we have to do rebalance again */
- unsigned long now = jiffies;
- unsigned long next_balance = now + 60*HZ;
- bool has_blocked_load = false;
- int update_next_balance = 0;
- int this_cpu = this_rq->cpu;
- int balance_cpu;
- int ret = false;
- struct rq *rq;
-
- SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
-
- /*
- * We assume there will be no idle load after this update and clear
- * the has_blocked flag. If a cpu enters idle in the mean time, it will
- * set the has_blocked flag and trig another update of idle load.
- * Because a cpu that becomes idle, is added to idle_cpus_mask before
- * setting the flag, we are sure to not clear the state and not
- * check the load of an idle cpu.
- */
- WRITE_ONCE(nohz.has_blocked, 0);
-
- /*
- * Ensures that if we miss the CPU, we must see the has_blocked
- * store from nohz_balance_enter_idle().
- */
- smp_mb();
-
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
- continue;
-
- /*
- * If this CPU gets work to do, stop the load balancing
- * work being done for other CPUs. Next load
- * balancing owner will pick it up.
- */
- if (need_resched()) {
- has_blocked_load = true;
- goto abort;
- }
-
- rq = cpu_rq(balance_cpu);
-
- has_blocked_load |= update_nohz_stats(rq, true);
-
- /*
- * If time for next balance is due,
- * do the balance.
- */
- if (time_after_eq(jiffies, rq->next_balance)) {
- struct rq_flags rf;
-
- rq_lock_irqsave(rq, &rf);
- update_rq_clock(rq);
- rq_unlock_irqrestore(rq, &rf);
-
- if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(rq, CPU_IDLE);
- }
-
- if (time_after(next_balance, rq->next_balance)) {
- next_balance = rq->next_balance;
- update_next_balance = 1;
- }
- }
-
- /* Newly idle CPU doesn't need an update */
- if (idle != CPU_NEWLY_IDLE) {
- update_blocked_averages(this_cpu);
- has_blocked_load |= this_rq->has_blocked_load;
- }
-
- if (flags & NOHZ_BALANCE_KICK)
- rebalance_domains(this_rq, CPU_IDLE);
-
- WRITE_ONCE(nohz.next_blocked,
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
-
- /* The full idle balance loop has been done */
- ret = true;
-
-abort:
- /* There is still blocked load, enable periodic update */
- if (has_blocked_load)
- WRITE_ONCE(nohz.has_blocked, 1);
-
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
- return ret;
-}
-
-/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
- */
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
-{
- int this_cpu = this_rq->cpu;
- unsigned int flags;
-
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
- return false;
-
- if (idle != CPU_IDLE) {
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- return false;
- }
-
- /* could be _relaxed() */
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
- if (!(flags & NOHZ_KICK_MASK))
- return false;
-
- _nohz_idle_balance(this_rq, flags, idle);
-
- return true;
-}
-
-static void nohz_newidle_balance(struct rq *this_rq)
-{
- int this_cpu = this_rq->cpu;
-
- /*
- * This CPU doesn't want to be disturbed by scheduler
- * housekeeping
- */
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
- return;
-
- /* Will wake up very soon. No time for doing anything else*/
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
- return;
-
- /* Don't need to update blocked load of idle CPUs*/
- if (!READ_ONCE(nohz.has_blocked) ||
- time_before(jiffies, READ_ONCE(nohz.next_blocked)))
- return;
-
- raw_spin_unlock(&this_rq->lock);
- /*
- * This CPU is going to be idle and blocked load of idle CPUs
- * need to be updated. Run the ilb locally as it is a good
- * candidate for ilb instead of waking up another idle CPU.
- * Kick an normal ilb if we failed to do the update.
- */
- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
- kick_ilb(NOHZ_STATS_KICK);
- raw_spin_lock(&this_rq->lock);
}
#else /* !CONFIG_NO_HZ_COMMON */
@@ -10441,169 +7786,6 @@
static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */
-/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- *
- * Returns:
- * < 0 - we released the lock and there are !fair tasks present
- * 0 - failed, no new tasks
- * > 0 - success, new (fair) tasks present
- */
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
-{
- unsigned long next_balance = jiffies + HZ;
- int this_cpu = this_rq->cpu;
- struct sched_domain *sd;
- int pulled_task = 0;
- u64 curr_cost = 0;
-
- update_misfit_status(NULL, this_rq);
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
- */
- this_rq->idle_stamp = rq_clock(this_rq);
-
- /*
- * Do not pull tasks towards !active CPUs...
- */
- if (!cpu_active(this_cpu))
- return 0;
-
- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- rq_unpin_lock(this_rq, rf);
-
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !READ_ONCE(this_rq->rd->overload)) {
-
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (sd)
- update_next_balance(sd, &next_balance);
- rcu_read_unlock();
-
- nohz_newidle_balance(this_rq);
-
- goto out;
- }
-
- raw_spin_unlock(&this_rq->lock);
-
- update_blocked_averages(this_cpu);
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
- u64 t0, domain_cost;
-
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, &next_balance);
- break;
- }
-
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
-
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
-
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
-
- curr_cost += domain_cost;
- }
-
- update_next_balance(sd, &next_balance);
-
- /*
- * Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
- */
- if (pulled_task || this_rq->nr_running > 0)
- break;
- }
- rcu_read_unlock();
-
- raw_spin_lock(&this_rq->lock);
-
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
-out:
- /*
- * While browsing the domains, we released the rq lock, a task could
- * have been enqueued in the meantime. Since we're not going idle,
- * pretend we pulled a task.
- */
- if (this_rq->cfs.h_nr_running && !pulled_task)
- pulled_task = 1;
-
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
- pulled_task = -1;
-
- if (pulled_task)
- this_rq->idle_stamp = 0;
-
- rq_repin_lock(this_rq, rf);
-
- return pulled_task;
-}
-
-/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
- */
-static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
-{
- struct rq *this_rq = this_rq();
- enum cpu_idle_type idle = this_rq->idle_balance ?
- CPU_IDLE : CPU_NOT_IDLE;
-
- /*
- * If this CPU has a pending nohz_balance_kick, then do the
- * balancing on behalf of the other idle CPUs whose ticks are
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
- * give the idle CPUs a chance to load balance. Else we may
- * load balance only within the local sched_domain hierarchy
- * and abort nohz_idle_balance altogether if we pull some load.
- */
- if (nohz_idle_balance(this_rq, idle))
- return;
-
- /* normal load balance */
- update_blocked_averages(this_rq->cpu);
- rebalance_domains(this_rq, idle);
-}
-
-/*
- * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
- */
-void trigger_load_balance(struct rq *rq)
-{
- /* Don't need to rebalance while attached to NULL domain */
- if (unlikely(on_null_domain(rq)))
- return;
-
- if (time_after_eq(jiffies, rq->next_balance))
- raise_softirq(SCHED_SOFTIRQ);
-
- nohz_balancer_kick(rq);
-}
static void rq_online_fair(struct rq *rq)
{
@@ -10640,9 +7822,6 @@
entity_tick(cfs_rq, se, queued);
}
- if (static_branch_unlikely(&sched_numa_balancing))
- task_tick_numa(rq, curr);
-
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
}
@@ -10655,7 +7834,7 @@
static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se, *curr;
+ struct sched_entity *curr;
struct rq *rq = this_rq();
struct rq_flags rf;
@@ -10666,20 +7845,9 @@
curr = cfs_rq->curr;
if (curr) {
update_curr(cfs_rq);
- se->vruntime = curr->vruntime;
}
- place_entity(cfs_rq, se, 1);
- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
- /*
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- swap(curr->vruntime, se->vruntime);
- resched_curr(rq);
- }
- se->vruntime -= cfs_rq->min_vruntime;
rq_unlock(rq, &rf);
}
@@ -10708,58 +7876,9 @@
check_preempt_curr(rq, p, 0);
}
-static inline bool vruntime_normalized(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
-
- /*
- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
- * the dequeue_entity(.flags=0) will already have normalized the
- * vruntime.
- */
- if (p->on_rq)
- return true;
-
- /*
- * When !on_rq, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - A forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - A task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- */
- if (!se->sum_exec_runtime ||
- (p->state == TASK_WAKING && p->sched_remote_wakeup))
- return true;
-
- return false;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Propagate the changes of the sched_entity across the tg tree to make it
- * visible to the root
- */
-static void propagate_entity_cfs_rq(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq;
-
- /* Start to propagate at parent */
- se = se->parent;
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (cfs_rq_throttled(cfs_rq))
- break;
-
- update_load_avg(cfs_rq, se, UPDATE_TG);
- }
-}
-#else
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
-#endif
+
static void detach_entity_cfs_rq(struct sched_entity *se)
{
@@ -10776,14 +7895,6 @@
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
@@ -10794,29 +7905,13 @@
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
- if (!vruntime_normalized(p)) {
- /*
- * Fix up our vruntime so that the current sleep doesn't
- * cause 'unlimited' sleep bonus.
- */
- place_entity(cfs_rq, se, 0);
- se->vruntime -= cfs_rq->min_vruntime;
- }
-
detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
attach_entity_cfs_rq(se);
-
- if (!vruntime_normalized(p))
- se->vruntime += cfs_rq->min_vruntime;
}
static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -10879,6 +7974,8 @@
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
+
+ cfs_rq->head = NULL;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -11203,7 +8300,6 @@
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
diff --color -rubN linux-5.7.6/kernel/sched/sched.h linux-5.7.6.cachy/kernel/sched/sched.h
--- linux-5.7.6/kernel/sched/sched.h 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/kernel/sched/sched.h 2020-07-24 17:52:04.479461959 +1000
@@ -516,6 +516,7 @@
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
+ struct sched_entity *head;
struct sched_entity *curr;
struct sched_entity *next;
struct sched_entity *last;
@@ -541,50 +542,7 @@
unsigned long runnable_avg;
} removed;
-#ifdef CONFIG_FAIR_GROUP_SCHED
- unsigned long tg_load_avg_contrib;
- long propagate;
- long prop_runnable_sum;
-
- /*
- * h_load = weight * f(tg)
- *
- * Where f(tg) is the recursive weight fraction assigned to
- * this group.
- */
- unsigned long h_load;
- u64 last_h_load_update;
- struct sched_entity *h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
-
- /*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
- * This list is used during load balance.
- */
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
-
-#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
- s64 runtime_remaining;
-
- u64 throttled_clock;
- u64 throttled_clock_task;
- u64 throttled_clock_task_time;
- int throttled;
- int throttle_count;
- struct list_head throttled_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
};
static inline int rt_bandwidth_enabled(void)
diff --color -rubN linux-5.7.6/Makefile linux-5.7.6.cachy/Makefile
--- linux-5.7.6/Makefile 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/Makefile 2020-07-24 14:33:53.453645295 +1000
@@ -2,8 +2,8 @@
VERSION = 5
PATCHLEVEL = 7
SUBLEVEL = 6
-EXTRAVERSION =
-NAME = Kleptomaniac Octopus
+EXTRAVERSION = -cachy
+NAME = Cachy
# *DOCUMENTATION*
# To see a list of typical targets execute "make help"