e9fb606121
The previous building scripts and patches will be moved to legacy
3937 lines, 106 KiB
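For orientation: the patch below drops the CFS red-black timeline and keeps runnable entities on a head-inserted doubly linked list (se->next[DIR_RIGHT] / se->next[DIR_LEFT], with cfs_rq->head as the only anchor), adds a per-entity quantom counter that update_curr() increments and pick_next_task_fair() resets, and rewrites wakeup_preempt_entity() to compare ratios built from runtime, task age and priority. The following is a minimal standalone C sketch of that list scheme, not part of the patch itself: struct entity, struct runqueue, enqueue() and dequeue() are illustrative stand-ins for the kernel's sched_entity and cfs_rq, while DIR_RIGHT/DIR_LEFT mirror the #defines the patch adds to fair.c.

    /*
     * Standalone sketch (not from the patch) of the head-inserted
     * doubly linked list used in place of the CFS rbtree.
     */
    #include <stdio.h>

    #define DIR_RIGHT 0
    #define DIR_LEFT  1

    struct entity {
            int id;
            struct entity *next[2];
    };

    struct runqueue {
            struct entity *head;
    };

    /* push the new entity at the head, as __enqueue_entity() does */
    static void enqueue(struct runqueue *rq, struct entity *se)
    {
            se->next[DIR_RIGHT] = rq->head;
            se->next[DIR_LEFT] = NULL;
            if (rq->head)
                    rq->head->next[DIR_LEFT] = se;
            rq->head = se;
    }

    /* unlink an entity, covering the head/middle/only-element cases
     * that __dequeue_entity() handles with separate branches */
    static void dequeue(struct runqueue *rq, struct entity *se)
    {
            struct entity *prev = se->next[DIR_LEFT];
            struct entity *next = se->next[DIR_RIGHT];

            if (prev)
                    prev->next[DIR_RIGHT] = next;
            else
                    rq->head = next;        /* se was the head */
            if (next)
                    next->next[DIR_LEFT] = prev;
    }

    int main(void)
    {
            struct runqueue rq = { .head = NULL };
            struct entity a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };

            enqueue(&rq, &a);
            enqueue(&rq, &b);
            enqueue(&rq, &c);
            dequeue(&rq, &b);

            for (struct entity *se = rq.head; se; se = se->next[DIR_RIGHT])
                    printf("%d ", se->id);  /* prints: 3 1 */
            printf("\n");
            return 0;
    }

Insertion is O(1) at the head; in the patch, pick_next_task_fair() then walks the whole list through next[DIR_RIGHT], using wakeup_preempt_entity() to pick the entity to run.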
diff --color -rubN linux-5.7.6/include/linux/sched.h linux-5.7.6.cachy/include/linux/sched.h
--- linux-5.7.6/include/linux/sched.h 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/include/linux/sched.h 2020-07-24 17:51:45.879582847 +1000
@@ -452,9 +452,14 @@
/* For load-balancing: */
struct load_weight load;
struct rb_node run_node;
+
+ struct sched_entity* next[2];
+
struct list_head group_node;
unsigned int on_rq;

+ int quantom;
+
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
@@ -464,16 +469,6 @@

struct sched_statistics statistics;

-#ifdef CONFIG_FAIR_GROUP_SCHED
- int depth;
- struct sched_entity *parent;
- /* rq on which this entity is (to be) queued: */
- struct cfs_rq *cfs_rq;
- /* rq "owned" by this entity/group: */
- struct cfs_rq *my_q;
- /* cached value of my_q->h_nr_running */
- unsigned long runnable_weight;
-#endif

#ifdef CONFIG_SMP
/*
diff --color -rubN linux-5.7.6/kernel/sched/core.c linux-5.7.6.cachy/kernel/sched/core.c
--- linux-5.7.6/kernel/sched/core.c 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/kernel/sched/core.c 2020-07-24 17:51:57.991504128 +1000
@@ -2672,18 +2672,14 @@
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
- INIT_LIST_HEAD(&p->se.group_node);

-#ifdef CONFIG_FAIR_GROUP_SCHED
- p->se.cfs_rq = NULL;
-#endif
+ INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

- RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
init_dl_inactive_task_timer(&p->dl);
__dl_clear_params(p);
@@ -3246,31 +3242,10 @@

#ifdef CONFIG_SMP

-/* rq->lock is NOT held, but preemption is disabled */
-static void __balance_callback(struct rq *rq)
-{
- struct callback_head *head, *next;
- void (*func)(struct rq *rq);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- head = rq->balance_callback;
- rq->balance_callback = NULL;
- while (head) {
- func = (void (*)(struct rq *))head->func;
- next = head->next;
- head->next = NULL;
- head = next;
-
- func(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
+///* rq->lock is NOT held, but preemption is disabled */

static inline void balance_callback(struct rq *rq)
{
- if (unlikely(rq->balance_callback))
- __balance_callback(rq);
}

#else
@@ -3606,7 +3581,6 @@

#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
#endif
}

@@ -6574,23 +6548,12 @@

wait_bit_init();

-#ifdef CONFIG_FAIR_GROUP_SCHED
- ptr += 2 * nr_cpu_ids * sizeof(void **);
-#endif
#ifdef CONFIG_RT_GROUP_SCHED
ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
if (ptr) {
ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);

-#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.se = (struct sched_entity **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
-
- root_task_group.cfs_rq = (struct cfs_rq **)ptr;
- ptr += nr_cpu_ids * sizeof(void **);
-
-#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -6641,32 +6604,7 @@
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt);
init_dl_rq(&rq->dl);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = ROOT_TASK_GROUP_LOAD;
- INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
- /*
- * How much CPU bandwidth does root_task_group get?
- *
- * In case of task-groups formed thr' the cgroup filesystem, it
- * gets 100% of the CPU resources in the system. This overall
- * system CPU resource is divided among the tasks of
- * root_task_group and its child task-groups in a fair manner,
- * based on each entity's (task or task-group's) weight
- * (se->load.weight).
- *
- * In other words, if root_task_group has 10 tasks of weight
- * 1024) and two child groups A0 and A1 (of weight 1024 each),
- * then A0's share of the CPU resource is:
- *
- * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
- *
- * We achieve this by letting root_task_group's tasks sit
- * directly in rq->cfs (i.e root_task_group->se[] = NULL).
- */
- init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
- init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+

rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
diff --color -rubN linux-5.7.6/kernel/sched/debug.c linux-5.7.6.cachy/kernel/sched/debug.c
|
|
--- linux-5.7.6/kernel/sched/debug.c 2020-06-25 01:49:26.000000000 +1000
|
|
+++ linux-5.7.6.cachy/kernel/sched/debug.c 2020-07-24 17:52:15.419390856 +1000
|
|
@@ -385,7 +385,7 @@
|
|
return;
|
|
|
|
PN(se->exec_start);
|
|
- PN(se->vruntime);
|
|
+ //PN(se->vruntime);
|
|
PN(se->sum_exec_runtime);
|
|
|
|
if (schedstat_enabled()) {
|
|
@@ -437,9 +437,9 @@
|
|
else
|
|
SEQ_printf(m, " %c", task_state_to_char(p));
|
|
|
|
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
|
|
- p->comm, task_pid_nr(p),
|
|
- SPLIT_NS(p->se.vruntime),
|
|
+ SEQ_printf(m, "%15s %5d %9d %9Ld %8d ",
|
|
+ p->comm, task_pid_nr(p), p->se.quantom,
|
|
+ //SPLIT_NS(p->se.vruntime),%9Ld.%06ld
|
|
(long long)(p->nvcsw + p->nivcsw),
|
|
p->prio);
|
|
|
|
@@ -464,9 +464,9 @@
|
|
|
|
SEQ_printf(m, "\n");
|
|
SEQ_printf(m, "runnable tasks:\n");
|
|
- SEQ_printf(m, " S task PID tree-key switches prio"
|
|
+ SEQ_printf(m, " S task PID quantom switches prio"
|
|
" wait-time sum-exec sum-sleep\n");
|
|
- SEQ_printf(m, "-------------------------------------------------------"
|
|
+ SEQ_printf(m, "--------------------------------------------------------------------"
|
|
"----------------------------------------------------\n");
|
|
|
|
rcu_read_lock();
|
|
@@ -481,10 +481,8 @@
|
|
|
|
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
|
{
|
|
- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
|
|
- spread, rq0_min_vruntime, spread0;
|
|
struct rq *rq = cpu_rq(cpu);
|
|
- struct sched_entity *last;
|
|
+ //struct sched_entity *last;
|
|
unsigned long flags;
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
@@ -498,26 +496,26 @@
|
|
SPLIT_NS(cfs_rq->exec_clock));
|
|
|
|
raw_spin_lock_irqsave(&rq->lock, flags);
|
|
- if (rb_first_cached(&cfs_rq->tasks_timeline))
|
|
- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
|
|
- last = __pick_last_entity(cfs_rq);
|
|
- if (last)
|
|
- max_vruntime = last->vruntime;
|
|
- min_vruntime = cfs_rq->min_vruntime;
|
|
- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
|
|
+ //if (rb_first_cached(&cfs_rq->tasks_timeline))
|
|
+ //MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
|
|
+ //last = __pick_last_entity(cfs_rq);
|
|
+ //if (last)
|
|
+ //max_vruntime = last->vruntime;
|
|
+ //min_vruntime = cfs_rq->min_vruntime;
|
|
+ //rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
|
|
- SPLIT_NS(MIN_vruntime));
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
|
|
- SPLIT_NS(min_vruntime));
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
|
|
- SPLIT_NS(max_vruntime));
|
|
- spread = max_vruntime - MIN_vruntime;
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
|
|
- SPLIT_NS(spread));
|
|
- spread0 = min_vruntime - rq0_min_vruntime;
|
|
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
|
|
- SPLIT_NS(spread0));
|
|
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
|
|
+ //SPLIT_NS(MIN_vruntime));
|
|
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
|
|
+ //SPLIT_NS(min_vruntime));
|
|
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
|
|
+ //SPLIT_NS(max_vruntime));
|
|
+ //spread = max_vruntime - MIN_vruntime;
|
|
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
|
|
+ //SPLIT_NS(spread));
|
|
+ //spread0 = min_vruntime - rq0_min_vruntime;
|
|
+ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
|
|
+ //SPLIT_NS(spread0));
|
|
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
|
|
cfs_rq->nr_spread_over);
|
|
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
|
|
@@ -875,7 +873,7 @@
|
|
#define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F))
|
|
|
|
PN(se.exec_start);
|
|
- PN(se.vruntime);
|
|
+ //PN(se.vruntime);
|
|
PN(se.sum_exec_runtime);
|
|
|
|
nr_switches = p->nvcsw + p->nivcsw;
|
|
diff --color -rubN linux-5.7.6/kernel/sched/fair.c linux-5.7.6.cachy/kernel/sched/fair.c
|
|
--- linux-5.7.6/kernel/sched/fair.c 2020-06-25 01:49:26.000000000 +1000
|
|
+++ linux-5.7.6.cachy/kernel/sched/fair.c 2020-07-24 17:52:09.159431543 +1000
|
|
@@ -86,6 +86,9 @@
|
|
|
|
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
|
|
|
|
+#define DIR_RIGHT 0
|
|
+#define DIR_LEFT 1
|
|
+
|
|
int sched_thermal_decay_shift;
|
|
static int __init setup_sched_thermal_decay_shift(char *str)
|
|
{
|
|
@@ -259,193 +262,6 @@
|
|
* CFS operations on generic schedulable entities:
|
|
*/
|
|
|
|
-#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
-static inline struct task_struct *task_of(struct sched_entity *se)
|
|
-{
|
|
- SCHED_WARN_ON(!entity_is_task(se));
|
|
- return container_of(se, struct task_struct, se);
|
|
-}
|
|
-
|
|
-/* Walk up scheduling entities hierarchy */
|
|
-#define for_each_sched_entity(se) \
|
|
- for (; se; se = se->parent)
|
|
-
|
|
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
|
|
-{
|
|
- return p->se.cfs_rq;
|
|
-}
|
|
-
|
|
-/* runqueue on which this entity is (to be) queued */
|
|
-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
|
|
-{
|
|
- return se->cfs_rq;
|
|
-}
|
|
-
|
|
-/* runqueue "owned" by this group */
|
|
-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
|
|
-{
|
|
- return grp->my_q;
|
|
-}
|
|
-
|
|
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
|
|
-{
|
|
- if (!path)
|
|
- return;
|
|
-
|
|
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
|
|
- autogroup_path(cfs_rq->tg, path, len);
|
|
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
|
|
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
|
|
- else
|
|
- strlcpy(path, "(null)", len);
|
|
-}
|
|
-
|
|
-static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
|
-{
|
|
- struct rq *rq = rq_of(cfs_rq);
|
|
- int cpu = cpu_of(rq);
|
|
-
|
|
- if (cfs_rq->on_list)
|
|
- return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
|
|
-
|
|
- cfs_rq->on_list = 1;
|
|
-
|
|
- /*
|
|
- * Ensure we either appear before our parent (if already
|
|
- * enqueued) or force our parent to appear after us when it is
|
|
- * enqueued. The fact that we always enqueue bottom-up
|
|
- * reduces this to two cases and a special case for the root
|
|
- * cfs_rq. Furthermore, it also means that we will always reset
|
|
- * tmp_alone_branch either when the branch is connected
|
|
- * to a tree or when we reach the top of the tree
|
|
- */
|
|
- if (cfs_rq->tg->parent &&
|
|
- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
|
|
- /*
|
|
- * If parent is already on the list, we add the child
|
|
- * just before. Thanks to circular linked property of
|
|
- * the list, this means to put the child at the tail
|
|
- * of the list that starts by parent.
|
|
- */
|
|
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
|
|
- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
|
|
- /*
|
|
- * The branch is now connected to its tree so we can
|
|
- * reset tmp_alone_branch to the beginning of the
|
|
- * list.
|
|
- */
|
|
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
|
|
- return true;
|
|
- }
|
|
-
|
|
- if (!cfs_rq->tg->parent) {
|
|
- /*
|
|
- * cfs rq without parent should be put
|
|
- * at the tail of the list.
|
|
- */
|
|
- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
|
|
- &rq->leaf_cfs_rq_list);
|
|
- /*
|
|
- * We have reach the top of a tree so we can reset
|
|
- * tmp_alone_branch to the beginning of the list.
|
|
- */
|
|
- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
|
|
- return true;
|
|
- }
|
|
-
|
|
- /*
|
|
- * The parent has not already been added so we want to
|
|
- * make sure that it will be put after us.
|
|
- * tmp_alone_branch points to the begin of the branch
|
|
- * where we will add parent.
|
|
- */
|
|
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
|
|
- /*
|
|
- * update tmp_alone_branch to points to the new begin
|
|
- * of the branch
|
|
- */
|
|
- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
|
|
- return false;
|
|
-}
|
|
-
|
|
-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
|
-{
|
|
- if (cfs_rq->on_list) {
|
|
- struct rq *rq = rq_of(cfs_rq);
|
|
-
|
|
- /*
|
|
- * With cfs_rq being unthrottled/throttled during an enqueue,
|
|
- * it can happen the tmp_alone_branch points the a leaf that
|
|
- * we finally want to del. In this case, tmp_alone_branch moves
|
|
- * to the prev element but it will point to rq->leaf_cfs_rq_list
|
|
- * at the end of the enqueue.
|
|
- */
|
|
- if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
|
|
- rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
|
|
-
|
|
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
|
|
- cfs_rq->on_list = 0;
|
|
- }
|
|
-}
|
|
-
|
|
-static inline void assert_list_leaf_cfs_rq(struct rq *rq)
|
|
-{
|
|
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
|
|
-}
|
|
-
|
|
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
|
|
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
|
|
- list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
|
|
- leaf_cfs_rq_list)
|
|
-
|
|
-/* Do the two (enqueued) entities belong to the same group ? */
|
|
-static inline struct cfs_rq *
|
|
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
|
|
-{
|
|
- if (se->cfs_rq == pse->cfs_rq)
|
|
- return se->cfs_rq;
|
|
-
|
|
- return NULL;
|
|
-}
|
|
-
|
|
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
|
|
-{
|
|
- return se->parent;
|
|
-}
|
|
-
|
|
-static void
|
|
-find_matching_se(struct sched_entity **se, struct sched_entity **pse)
|
|
-{
|
|
- int se_depth, pse_depth;
|
|
-
|
|
- /*
|
|
- * preemption test can be made between sibling entities who are in the
|
|
- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
|
|
- * both tasks until we find their ancestors who are siblings of common
|
|
- * parent.
|
|
- */
|
|
-
|
|
- /* First walk up until both entities are at same depth */
|
|
- se_depth = (*se)->depth;
|
|
- pse_depth = (*pse)->depth;
|
|
-
|
|
- while (se_depth > pse_depth) {
|
|
- se_depth--;
|
|
- *se = parent_entity(*se);
|
|
- }
|
|
-
|
|
- while (pse_depth > se_depth) {
|
|
- pse_depth--;
|
|
- *pse = parent_entity(*pse);
|
|
- }
|
|
-
|
|
- while (!is_same_group(*se, *pse)) {
|
|
- *se = parent_entity(*se);
|
|
- *pse = parent_entity(*pse);
|
|
- }
|
|
-}
|
|
-
|
|
-#else /* !CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
static inline struct task_struct *task_of(struct sched_entity *se)
|
|
{
|
|
@@ -506,138 +322,67 @@
|
|
{
|
|
}
|
|
|
|
-#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
static __always_inline
|
|
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
|
|
|
|
-/**************************************************************
|
|
- * Scheduling class tree data structure manipulation methods:
|
|
+/*
|
|
+ * Enqueue an entity
|
|
*/
|
|
-
|
|
-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
|
|
+static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- s64 delta = (s64)(vruntime - max_vruntime);
|
|
- if (delta > 0)
|
|
- max_vruntime = vruntime;
|
|
+ se->next[DIR_RIGHT] = NULL;
|
|
+ se->next[DIR_LEFT] = NULL;
|
|
|
|
- return max_vruntime;
|
|
-}
|
|
+ if (likely(cfs_rq->head))
|
|
+ {
|
|
+ se->next[DIR_RIGHT] = cfs_rq->head;
|
|
+ cfs_rq->head->next[DIR_LEFT] = se;
|
|
|
|
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
|
|
-{
|
|
- s64 delta = (s64)(vruntime - min_vruntime);
|
|
- if (delta < 0)
|
|
- min_vruntime = vruntime;
|
|
+ // lastly reset the head
|
|
+ cfs_rq->head = se;
|
|
|
|
- return min_vruntime;
|
|
-}
|
|
+ return;
|
|
+ }
|
|
|
|
-static inline int entity_before(struct sched_entity *a,
|
|
- struct sched_entity *b)
|
|
-{
|
|
- return (s64)(a->vruntime - b->vruntime) < 0;
|
|
+ // if empty rq
|
|
+ cfs_rq->head = se;
|
|
}
|
|
|
|
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
|
|
+static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
- struct sched_entity *curr = cfs_rq->curr;
|
|
- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
|
|
-
|
|
- u64 vruntime = cfs_rq->min_vruntime;
|
|
|
|
- if (curr) {
|
|
- if (curr->on_rq)
|
|
- vruntime = curr->vruntime;
|
|
- else
|
|
- curr = NULL;
|
|
+ // if only one se in rq
|
|
+ if (unlikely(cfs_rq->head->next[DIR_RIGHT] == NULL))
|
|
+ cfs_rq->head = NULL;
|
|
+ else if (unlikely(se == cfs_rq->head))
|
|
+ {
|
|
+ // if it is the head
|
|
+ cfs_rq->head = cfs_rq->head->next[DIR_RIGHT];
|
|
+ cfs_rq->head->next[DIR_LEFT] = NULL;
|
|
}
|
|
-
|
|
- if (leftmost) { /* non-empty tree */
|
|
- struct sched_entity *se;
|
|
- se = rb_entry(leftmost, struct sched_entity, run_node);
|
|
-
|
|
- if (!curr)
|
|
- vruntime = se->vruntime;
|
|
else
|
|
- vruntime = min_vruntime(vruntime, se->vruntime);
|
|
- }
|
|
-
|
|
- /* ensure we never gain time by being placed backwards. */
|
|
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
|
|
-#ifndef CONFIG_64BIT
|
|
- smp_wmb();
|
|
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
|
|
-#endif
|
|
-}
|
|
+ {
|
|
+ // if in the middle
|
|
+ struct sched_entity *prev = se->next[DIR_LEFT];
|
|
+ struct sched_entity *next = se->next[DIR_RIGHT];
|
|
|
|
-/*
|
|
- * Enqueue an entity into the rb-tree:
|
|
- */
|
|
-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
|
|
- struct rb_node *parent = NULL;
|
|
- struct sched_entity *entry;
|
|
- bool leftmost = true;
|
|
+ prev->next[DIR_RIGHT] = next;
|
|
|
|
- /*
|
|
- * Find the right place in the rbtree:
|
|
- */
|
|
- while (*link) {
|
|
- parent = *link;
|
|
- entry = rb_entry(parent, struct sched_entity, run_node);
|
|
- /*
|
|
- * We dont care about collisions. Nodes with
|
|
- * the same key stay together.
|
|
- */
|
|
- if (entity_before(se, entry)) {
|
|
- link = &parent->rb_left;
|
|
- } else {
|
|
- link = &parent->rb_right;
|
|
- leftmost = false;
|
|
+ if (next)
|
|
+ next->next[DIR_LEFT] = prev;
|
|
}
|
|
- }
|
|
-
|
|
- rb_link_node(&se->run_node, parent, link);
|
|
- rb_insert_color_cached(&se->run_node,
|
|
- &cfs_rq->tasks_timeline, leftmost);
|
|
-}
|
|
-
|
|
-static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
|
|
}
|
|
|
|
struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
|
|
{
|
|
- struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
|
|
-
|
|
- if (!left)
|
|
- return NULL;
|
|
-
|
|
- return rb_entry(left, struct sched_entity, run_node);
|
|
-}
|
|
-
|
|
-static struct sched_entity *__pick_next_entity(struct sched_entity *se)
|
|
-{
|
|
- struct rb_node *next = rb_next(&se->run_node);
|
|
-
|
|
- if (!next)
|
|
- return NULL;
|
|
-
|
|
- return rb_entry(next, struct sched_entity, run_node);
|
|
+ return cfs_rq->head;
|
|
}
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
|
|
{
|
|
- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
|
|
-
|
|
- if (!last)
|
|
- return NULL;
|
|
-
|
|
- return rb_entry(last, struct sched_entity, run_node);
|
|
+ return cfs_rq->head;
|
|
}
|
|
|
|
/**************************************************************
|
|
@@ -723,16 +468,6 @@
|
|
return slice;
|
|
}
|
|
|
|
-/*
|
|
- * We calculate the vruntime slice of a to-be-inserted task.
|
|
- *
|
|
- * vs = s/w
|
|
- */
|
|
-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
- return calc_delta_fair(sched_slice(cfs_rq, se), se);
|
|
-}
|
|
-
|
|
#include "pelt.h"
|
|
#ifdef CONFIG_SMP
|
|
|
|
@@ -856,6 +591,7 @@
|
|
return;
|
|
|
|
curr->exec_start = now;
|
|
+ curr->quantom++;
|
|
|
|
schedstat_set(curr->statistics.exec_max,
|
|
max(delta_exec, curr->statistics.exec_max));
|
|
@@ -864,12 +600,10 @@
|
|
schedstat_add(cfs_rq->exec_clock, delta_exec);
|
|
|
|
curr->vruntime += calc_delta_fair(delta_exec, curr);
|
|
- update_min_vruntime(cfs_rq);
|
|
|
|
if (entity_is_task(curr)) {
|
|
struct task_struct *curtask = task_of(curr);
|
|
|
|
- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
|
|
cgroup_account_cputime(curtask, delta_exec);
|
|
account_group_exec_runtime(curtask, delta_exec);
|
|
}
|
|
@@ -2897,39 +2631,6 @@
|
|
}
|
|
}
|
|
|
|
-/*
|
|
- * Drive the periodic memory faults..
|
|
- */
|
|
-static void task_tick_numa(struct rq *rq, struct task_struct *curr)
|
|
-{
|
|
- struct callback_head *work = &curr->numa_work;
|
|
- u64 period, now;
|
|
-
|
|
- /*
|
|
- * We don't care about NUMA placement if we don't have memory.
|
|
- */
|
|
- if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
|
|
- return;
|
|
-
|
|
- /*
|
|
- * Using runtime rather than walltime has the dual advantage that
|
|
- * we (mostly) drive the selection from busy threads and that the
|
|
- * task needs to have done some actual work before we bother with
|
|
- * NUMA placement.
|
|
- */
|
|
- now = curr->se.sum_exec_runtime;
|
|
- period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
|
|
-
|
|
- if (now > curr->node_stamp + period) {
|
|
- if (!curr->node_stamp)
|
|
- curr->numa_scan_period = task_scan_start(curr);
|
|
- curr->node_stamp += period;
|
|
-
|
|
- if (!time_before(jiffies, curr->mm->numa_next_scan))
|
|
- task_work_add(curr, work, true);
|
|
- }
|
|
-}
|
|
-
|
|
static void update_scan_period(struct task_struct *p, int new_cpu)
|
|
{
|
|
int src_nid = cpu_to_node(task_cpu(p));
|
|
@@ -2965,9 +2666,6 @@
|
|
}
|
|
|
|
#else
|
|
-static void task_tick_numa(struct rq *rq, struct task_struct *curr)
|
|
-{
|
|
-}
|
|
|
|
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
|
|
{
|
|
@@ -4072,50 +3770,9 @@
|
|
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
{
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
- s64 d = se->vruntime - cfs_rq->min_vruntime;
|
|
-
|
|
- if (d < 0)
|
|
- d = -d;
|
|
-
|
|
- if (d > 3*sysctl_sched_latency)
|
|
- schedstat_inc(cfs_rq->nr_spread_over);
|
|
#endif
|
|
}
|
|
|
|
-static void
|
|
-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
|
-{
|
|
- u64 vruntime = cfs_rq->min_vruntime;
|
|
-
|
|
- /*
|
|
- * The 'current' period is already promised to the current tasks,
|
|
- * however the extra weight of the new task will slow them down a
|
|
- * little, place the new task so that it fits in the slot that
|
|
- * stays open at the end.
|
|
- */
|
|
- if (initial && sched_feat(START_DEBIT))
|
|
- vruntime += sched_vslice(cfs_rq, se);
|
|
-
|
|
- /* sleeps up to a single latency don't count. */
|
|
- if (!initial) {
|
|
- unsigned long thresh = sysctl_sched_latency;
|
|
-
|
|
- /*
|
|
- * Halve their sleep time's effect, to allow
|
|
- * for a gentler effect of sleepers:
|
|
- */
|
|
- if (sched_feat(GENTLE_FAIR_SLEEPERS))
|
|
- thresh >>= 1;
|
|
-
|
|
- vruntime -= thresh;
|
|
- }
|
|
-
|
|
- /* ensure we never gain time by being placed backwards. */
|
|
- se->vruntime = max_vruntime(se->vruntime, vruntime);
|
|
-}
|
|
-
|
|
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
|
|
-
|
|
static inline void check_schedstat_required(void)
|
|
{
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
@@ -4171,28 +3828,11 @@
|
|
static void
|
|
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
{
|
|
- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
|
|
bool curr = cfs_rq->curr == se;
|
|
|
|
- /*
|
|
- * If we're the current task, we must renormalise before calling
|
|
- * update_curr().
|
|
- */
|
|
- if (renorm && curr)
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
-
|
|
update_curr(cfs_rq);
|
|
|
|
/*
|
|
- * Otherwise, renormalise after, such that we're placed at the current
|
|
- * moment in time, instead of some random moment in the past. Being
|
|
- * placed in the past could significantly boost this task to the
|
|
- * fairness detriment of existing tasks.
|
|
- */
|
|
- if (renorm && !curr)
|
|
- se->vruntime += cfs_rq->min_vruntime;
|
|
-
|
|
- /*
|
|
* When enqueuing a sched_entity, we must:
|
|
* - Update loads to have both entity and cfs_rq synced with now.
|
|
* - Add its load to cfs_rq->runnable_avg
|
|
@@ -4205,71 +3845,12 @@
|
|
update_cfs_group(se);
|
|
account_entity_enqueue(cfs_rq, se);
|
|
|
|
- if (flags & ENQUEUE_WAKEUP)
|
|
- place_entity(cfs_rq, se, 0);
|
|
-
|
|
check_schedstat_required();
|
|
update_stats_enqueue(cfs_rq, se, flags);
|
|
check_spread(cfs_rq, se);
|
|
if (!curr)
|
|
__enqueue_entity(cfs_rq, se);
|
|
se->on_rq = 1;
|
|
-
|
|
- /*
|
|
- * When bandwidth control is enabled, cfs might have been removed
|
|
- * because of a parent been throttled but cfs->nr_running > 1. Try to
|
|
- * add it unconditionnally.
|
|
- */
|
|
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
|
|
- list_add_leaf_cfs_rq(cfs_rq);
|
|
-
|
|
- if (cfs_rq->nr_running == 1)
|
|
- check_enqueue_throttle(cfs_rq);
|
|
-}
|
|
-
|
|
-static void __clear_buddies_last(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- if (cfs_rq->last != se)
|
|
- break;
|
|
-
|
|
- cfs_rq->last = NULL;
|
|
- }
|
|
-}
|
|
-
|
|
-static void __clear_buddies_next(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- if (cfs_rq->next != se)
|
|
- break;
|
|
-
|
|
- cfs_rq->next = NULL;
|
|
- }
|
|
-}
|
|
-
|
|
-static void __clear_buddies_skip(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se) {
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- if (cfs_rq->skip != se)
|
|
- break;
|
|
-
|
|
- cfs_rq->skip = NULL;
|
|
- }
|
|
-}
|
|
-
|
|
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
-{
|
|
- if (cfs_rq->last == se)
|
|
- __clear_buddies_last(se);
|
|
-
|
|
- if (cfs_rq->next == se)
|
|
- __clear_buddies_next(se);
|
|
-
|
|
- if (cfs_rq->skip == se)
|
|
- __clear_buddies_skip(se);
|
|
}
|
|
|
|
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
|
@@ -4295,75 +3876,15 @@
|
|
|
|
update_stats_dequeue(cfs_rq, se, flags);
|
|
|
|
- clear_buddies(cfs_rq, se);
|
|
-
|
|
- if (se != cfs_rq->curr)
|
|
__dequeue_entity(cfs_rq, se);
|
|
+
|
|
se->on_rq = 0;
|
|
account_entity_dequeue(cfs_rq, se);
|
|
|
|
- /*
|
|
- * Normalize after update_curr(); which will also have moved
|
|
- * min_vruntime if @se is the one holding it back. But before doing
|
|
- * update_min_vruntime() again, which will discount @se's position and
|
|
- * can move min_vruntime forward still more.
|
|
- */
|
|
- if (!(flags & DEQUEUE_SLEEP))
|
|
- se->vruntime -= cfs_rq->min_vruntime;
|
|
-
|
|
/* return excess runtime on last dequeue */
|
|
return_cfs_rq_runtime(cfs_rq);
|
|
|
|
update_cfs_group(se);
|
|
-
|
|
- /*
|
|
- * Now advance min_vruntime if @se was the entity holding it back,
|
|
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
|
|
- * put back on, and if we advance min_vruntime, we'll be placed back
|
|
- * further than we started -- ie. we'll be penalized.
|
|
- */
|
|
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
|
|
- update_min_vruntime(cfs_rq);
|
|
-}
|
|
-
|
|
-/*
|
|
- * Preempt the current task with a newly woken task if needed:
|
|
- */
|
|
-static void
|
|
-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
-{
|
|
- unsigned long ideal_runtime, delta_exec;
|
|
- struct sched_entity *se;
|
|
- s64 delta;
|
|
-
|
|
- ideal_runtime = sched_slice(cfs_rq, curr);
|
|
- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
|
|
- if (delta_exec > ideal_runtime) {
|
|
- resched_curr(rq_of(cfs_rq));
|
|
- /*
|
|
- * The current task ran long enough, ensure it doesn't get
|
|
- * re-elected due to buddy favours.
|
|
- */
|
|
- clear_buddies(cfs_rq, curr);
|
|
- return;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Ensure that a task that missed wakeup preemption by a
|
|
- * narrow margin doesn't have to wait for a full slice.
|
|
- * This also mitigates buddy induced latencies under load.
|
|
- */
|
|
- if (delta_exec < sysctl_sched_min_granularity)
|
|
- return;
|
|
-
|
|
- se = __pick_first_entity(cfs_rq);
|
|
- delta = curr->vruntime - se->vruntime;
|
|
-
|
|
- if (delta < 0)
|
|
- return;
|
|
-
|
|
- if (delta > ideal_runtime)
|
|
- resched_curr(rq_of(cfs_rq));
|
|
}
|
|
|
|
static void
|
|
@@ -4371,96 +3892,18 @@
|
|
{
|
|
/* 'current' is not kept within the tree. */
|
|
if (se->on_rq) {
|
|
- /*
|
|
- * Any task has to be enqueued before it get to execute on
|
|
- * a CPU. So account for the time it spent waiting on the
|
|
- * runqueue.
|
|
- */
|
|
update_stats_wait_end(cfs_rq, se);
|
|
- __dequeue_entity(cfs_rq, se);
|
|
update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
}
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
cfs_rq->curr = se;
|
|
|
|
- /*
|
|
- * Track our maximum slice length, if the CPU's load is at
|
|
- * least twice that of our own weight (i.e. dont track it
|
|
- * when there are only lesser-weight tasks around):
|
|
- */
|
|
- if (schedstat_enabled() &&
|
|
- rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
|
|
- schedstat_set(se->statistics.slice_max,
|
|
- max((u64)schedstat_val(se->statistics.slice_max),
|
|
- se->sum_exec_runtime - se->prev_sum_exec_runtime));
|
|
- }
|
|
-
|
|
se->prev_sum_exec_runtime = se->sum_exec_runtime;
|
|
}
|
|
|
|
static int
|
|
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
|
|
-
|
|
-/*
|
|
- * Pick the next process, keeping these things in mind, in this order:
|
|
- * 1) keep things fair between processes/task groups
|
|
- * 2) pick the "next" process, since someone really wants that to run
|
|
- * 3) pick the "last" process, for cache locality
|
|
- * 4) do not run the "skip" process, if something else is available
|
|
- */
|
|
-static struct sched_entity *
|
|
-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
|
|
-{
|
|
- struct sched_entity *left = __pick_first_entity(cfs_rq);
|
|
- struct sched_entity *se;
|
|
-
|
|
- /*
|
|
- * If curr is set we have to see if its left of the leftmost entity
|
|
- * still in the tree, provided there was anything in the tree at all.
|
|
- */
|
|
- if (!left || (curr && entity_before(curr, left)))
|
|
- left = curr;
|
|
-
|
|
- se = left; /* ideally we run the leftmost entity */
|
|
-
|
|
- /*
|
|
- * Avoid running the skip buddy, if running something else can
|
|
- * be done without getting too unfair.
|
|
- */
|
|
- if (cfs_rq->skip == se) {
|
|
- struct sched_entity *second;
|
|
-
|
|
- if (se == curr) {
|
|
- second = __pick_first_entity(cfs_rq);
|
|
- } else {
|
|
- second = __pick_next_entity(se);
|
|
- if (!second || (curr && entity_before(curr, second)))
|
|
- second = curr;
|
|
- }
|
|
-
|
|
- if (second && wakeup_preempt_entity(second, left) < 1)
|
|
- se = second;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Prefer last buddy, try to return the CPU to a preempted task.
|
|
- */
|
|
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
|
|
- se = cfs_rq->last;
|
|
-
|
|
- /*
|
|
- * Someone really wants this to run. If it's not unfair, run it.
|
|
- */
|
|
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
|
|
- se = cfs_rq->next;
|
|
-
|
|
- clear_buddies(cfs_rq, se);
|
|
-
|
|
- return se;
|
|
-}
|
|
-
|
|
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
|
+wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se);
|
|
|
|
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
|
{
|
|
@@ -4471,21 +3914,19 @@
|
|
if (prev->on_rq)
|
|
update_curr(cfs_rq);
|
|
|
|
- /* throttle cfs_rqs exceeding runtime */
|
|
- check_cfs_rq_runtime(cfs_rq);
|
|
-
|
|
- check_spread(cfs_rq, prev);
|
|
-
|
|
if (prev->on_rq) {
|
|
update_stats_wait_start(cfs_rq, prev);
|
|
- /* Put 'current' back into the tree. */
|
|
- __enqueue_entity(cfs_rq, prev);
|
|
/* in !on_rq case, update occurred at dequeue */
|
|
update_load_avg(cfs_rq, prev, 0);
|
|
}
|
|
cfs_rq->curr = NULL;
|
|
}
|
|
|
|
+static int check_preempt_curr_fair(struct sched_entity *curr)
|
|
+{
|
|
+ return 1;
|
|
+}
|
|
+
|
|
static void
|
|
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
|
|
{
|
|
@@ -4509,6 +3950,12 @@
|
|
resched_curr(rq_of(cfs_rq));
|
|
return;
|
|
}
|
|
+
|
|
+ if (check_preempt_curr_fair(curr) == 1) {
|
|
+ resched_curr(rq_of(cfs_rq));
|
|
+ return;
|
|
+ }
|
|
+
|
|
/*
|
|
* don't let the period tick interfere with the hrtick preemption
|
|
*/
|
|
@@ -4516,9 +3963,6 @@
|
|
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
|
|
return;
|
|
#endif
|
|
-
|
|
- if (cfs_rq->nr_running > 1)
|
|
- check_preempt_tick(cfs_rq, curr);
|
|
}
|
|
|
|
|
|
@@ -5082,30 +4526,6 @@
|
|
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
|
}
|
|
|
|
-/*
|
|
- * When a group wakes up we want to make sure that its quota is not already
|
|
- * expired/exceeded, otherwise it may be allowed to steal additional ticks of
|
|
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
|
|
- */
|
|
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
|
|
-{
|
|
- if (!cfs_bandwidth_used())
|
|
- return;
|
|
-
|
|
- /* an active group must be handled by the update_curr()->put() path */
|
|
- if (!cfs_rq->runtime_enabled || cfs_rq->curr)
|
|
- return;
|
|
-
|
|
- /* ensure the group is not already throttled */
|
|
- if (cfs_rq_throttled(cfs_rq))
|
|
- return;
|
|
-
|
|
- /* update runtime allocation */
|
|
- account_cfs_rq_runtime(cfs_rq, 0);
|
|
- if (cfs_rq->runtime_remaining <= 0)
|
|
- throttle_cfs_rq(cfs_rq);
|
|
-}
|
|
-
|
|
static void sync_throttle(struct task_group *tg, int cpu)
|
|
{
|
|
struct cfs_rq *pcfs_rq, *cfs_rq;
|
|
@@ -5123,26 +4543,6 @@
|
|
cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
|
|
}
|
|
|
|
-/* conditionally throttle active cfs_rq's from put_prev_entity() */
|
|
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
|
-{
|
|
- if (!cfs_bandwidth_used())
|
|
- return false;
|
|
-
|
|
- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
|
|
- return false;
|
|
-
|
|
- /*
|
|
- * it's possible for a throttled entity to be forced into a running
|
|
- * state (e.g. set_curr_task), in this case we're finished.
|
|
- */
|
|
- if (cfs_rq_throttled(cfs_rq))
|
|
- return true;
|
|
-
|
|
- throttle_cfs_rq(cfs_rq);
|
|
- return true;
|
|
-}
|
|
-
|
|
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
|
|
{
|
|
struct cfs_bandwidth *cfs_b =
|
|
@@ -5318,8 +4718,6 @@
|
|
}
|
|
|
|
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
|
|
-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
|
|
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
|
|
static inline void sync_throttle(struct task_group *tg, int cpu) {}
|
|
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
|
|
|
@@ -5548,8 +4946,6 @@
|
|
hrtick_update(rq);
|
|
}
|
|
|
|
-static void set_next_buddy(struct sched_entity *se);
|
|
-
|
|
/*
|
|
* The dequeue_task method is called before nr_running is
|
|
* decreased. We remove the task from the rbtree and
|
|
@@ -5578,12 +4974,6 @@
|
|
if (cfs_rq->load.weight) {
|
|
/* Avoid re-evaluating load for this entity: */
|
|
se = parent_entity(se);
|
|
- /*
|
|
- * Bias pick_next to pick a task from this cfs_rq, as
|
|
- * p is sleeping when it is within its sched_slice.
|
|
- */
|
|
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
|
|
- set_next_buddy(se);
|
|
break;
|
|
}
|
|
flags |= DEQUEUE_SLEEP;
|
|
@@ -5699,53 +5089,6 @@
|
|
return cpu_rq(cpu)->cpu_capacity;
|
|
}
|
|
|
|
-static void record_wakee(struct task_struct *p)
|
|
-{
|
|
- /*
|
|
- * Only decay a single time; tasks that have less then 1 wakeup per
|
|
- * jiffy will not have built up many flips.
|
|
- */
|
|
- if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
|
|
- current->wakee_flips >>= 1;
|
|
- current->wakee_flip_decay_ts = jiffies;
|
|
- }
|
|
-
|
|
- if (current->last_wakee != p) {
|
|
- current->last_wakee = p;
|
|
- current->wakee_flips++;
|
|
- }
|
|
-}
|
|
-
|
|
-/*
|
|
- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
|
|
- *
|
|
- * A waker of many should wake a different task than the one last awakened
|
|
- * at a frequency roughly N times higher than one of its wakees.
|
|
- *
|
|
- * In order to determine whether we should let the load spread vs consolidating
|
|
- * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
|
|
- * partner, and a factor of lls_size higher frequency in the other.
|
|
- *
|
|
- * With both conditions met, we can be relatively sure that the relationship is
|
|
- * non-monogamous, with partner count exceeding socket size.
|
|
- *
|
|
- * Waker/wakee being client/server, worker/dispatcher, interrupt source or
|
|
- * whatever is irrelevant, spread criteria is apparent partner count exceeds
|
|
- * socket size.
|
|
- */
|
|
-static int wake_wide(struct task_struct *p)
|
|
-{
|
|
- unsigned int master = current->wakee_flips;
|
|
- unsigned int slave = p->wakee_flips;
|
|
- int factor = this_cpu_read(sd_llc_size);
|
|
-
|
|
- if (master < slave)
|
|
- swap(master, slave);
|
|
- if (slave < factor || master < slave * factor)
|
|
- return 0;
|
|
- return 1;
|
|
-}
|
|
-
|
|
/*
|
|
* The purpose of wake_affine() is to quickly determine on which CPU we can run
|
|
* soonest. For the purpose of speed we only consider the waking and previous
|
|
@@ -6402,238 +5745,6 @@
|
|
return min_t(unsigned long, util, capacity_orig_of(cpu));
|
|
}
|
|
|
|
-/*
|
|
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
|
|
- * to @dst_cpu.
|
|
- */
|
|
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
|
|
-{
|
|
- struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
|
|
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
|
|
-
|
|
- /*
|
|
- * If @p migrates from @cpu to another, remove its contribution. Or,
|
|
- * if @p migrates from another CPU to @cpu, add its contribution. In
|
|
- * the other cases, @cpu is not impacted by the migration, so the
|
|
- * util_avg should already be correct.
|
|
- */
|
|
- if (task_cpu(p) == cpu && dst_cpu != cpu)
|
|
- sub_positive(&util, task_util(p));
|
|
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
|
|
- util += task_util(p);
|
|
-
|
|
- if (sched_feat(UTIL_EST)) {
|
|
- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
|
|
-
|
|
- /*
|
|
- * During wake-up, the task isn't enqueued yet and doesn't
|
|
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
|
|
- * so just add it (if needed) to "simulate" what will be
|
|
- * cpu_util() after the task has been enqueued.
|
|
- */
|
|
- if (dst_cpu == cpu)
|
|
- util_est += _task_util_est(p);
|
|
-
|
|
- util = max(util, util_est);
|
|
- }
|
|
-
|
|
- return min(util, capacity_orig_of(cpu));
|
|
-}
|
|
-
|
|
-/*
|
|
- * compute_energy(): Estimates the energy that @pd would consume if @p was
|
|
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
|
|
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
|
|
- * to compute what would be the energy if we decided to actually migrate that
|
|
- * task.
|
|
- */
|
|
-static long
|
|
-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
|
|
-{
|
|
- struct cpumask *pd_mask = perf_domain_span(pd);
|
|
- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
|
|
- unsigned long max_util = 0, sum_util = 0;
|
|
- int cpu;
|
|
-
|
|
- /*
|
|
- * The capacity state of CPUs of the current rd can be driven by CPUs
|
|
- * of another rd if they belong to the same pd. So, account for the
|
|
- * utilization of these CPUs too by masking pd with cpu_online_mask
|
|
- * instead of the rd span.
|
|
- *
|
|
- * If an entire pd is outside of the current rd, it will not appear in
|
|
- * its pd list and will not be accounted by compute_energy().
|
|
- */
|
|
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
|
|
- unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
|
|
- struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
|
|
-
|
|
- /*
|
|
- * Busy time computation: utilization clamping is not
|
|
- * required since the ratio (sum_util / cpu_capacity)
|
|
- * is already enough to scale the EM reported power
|
|
- * consumption at the (eventually clamped) cpu_capacity.
|
|
- */
|
|
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
|
|
- ENERGY_UTIL, NULL);
|
|
-
|
|
- /*
|
|
- * Performance domain frequency: utilization clamping
|
|
- * must be considered since it affects the selection
|
|
- * of the performance domain frequency.
|
|
- * NOTE: in case RT tasks are running, by default the
|
|
- * FREQUENCY_UTIL's utilization can be max OPP.
|
|
- */
|
|
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
|
|
- FREQUENCY_UTIL, tsk);
|
|
- max_util = max(max_util, cpu_util);
|
|
- }
|
|
-
|
|
- return em_pd_energy(pd->em_pd, max_util, sum_util);
|
|
-}
|
|
-
|
|
-/*
|
|
- * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
|
|
- * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
|
|
- * spare capacity in each performance domain and uses it as a potential
|
|
- * candidate to execute the task. Then, it uses the Energy Model to figure
|
|
- * out which of the CPU candidates is the most energy-efficient.
|
|
- *
|
|
- * The rationale for this heuristic is as follows. In a performance domain,
|
|
- * all the most energy efficient CPU candidates (according to the Energy
|
|
- * Model) are those for which we'll request a low frequency. When there are
|
|
- * several CPUs for which the frequency request will be the same, we don't
|
|
- * have enough data to break the tie between them, because the Energy Model
|
|
- * only includes active power costs. With this model, if we assume that
|
|
- * frequency requests follow utilization (e.g. using schedutil), the CPU with
|
|
- * the maximum spare capacity in a performance domain is guaranteed to be among
|
|
- * the best candidates of the performance domain.
|
|
- *
|
|
- * In practice, it could be preferable from an energy standpoint to pack
|
|
- * small tasks on a CPU in order to let other CPUs go in deeper idle states,
|
|
- * but that could also hurt our chances to go cluster idle, and we have no
|
|
- * ways to tell with the current Energy Model if this is actually a good
|
|
- * idea or not. So, find_energy_efficient_cpu() basically favors
|
|
- * cluster-packing, and spreading inside a cluster. That should at least be
|
|
- * a good thing for latency, and this is consistent with the idea that most
|
|
- * of the energy savings of EAS come from the asymmetry of the system, and
|
|
- * not so much from breaking the tie between identical CPUs. That's also the
|
|
- * reason why EAS is enabled in the topology code only for systems where
|
|
- * SD_ASYM_CPUCAPACITY is set.
|
|
- *
|
|
- * NOTE: Forkees are not accepted in the energy-aware wake-up path because
|
|
- * they don't have any useful utilization data yet and it's not possible to
|
|
- * forecast their impact on energy consumption. Consequently, they will be
|
|
- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
|
|
- * to be energy-inefficient in some use-cases. The alternative would be to
|
|
- * bias new tasks towards specific types of CPUs first, or to try to infer
|
|
- * their util_avg from the parent task, but those heuristics could hurt
|
|
- * other use-cases too. So, until someone finds a better way to solve this,
|
|
- * let's keep things simple by re-using the existing slow path.
|
|
- */
|
|
-static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
|
|
-{
|
|
- unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
|
|
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
|
|
- unsigned long cpu_cap, util, base_energy = 0;
|
|
- int cpu, best_energy_cpu = prev_cpu;
|
|
- struct sched_domain *sd;
|
|
- struct perf_domain *pd;
|
|
-
|
|
- rcu_read_lock();
|
|
- pd = rcu_dereference(rd->pd);
|
|
- if (!pd || READ_ONCE(rd->overutilized))
|
|
- goto fail;
|
|
-
|
|
- /*
|
|
- * Energy-aware wake-up happens on the lowest sched_domain starting
|
|
- * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
|
|
- */
|
|
- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
|
|
- while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
|
|
- sd = sd->parent;
|
|
- if (!sd)
|
|
- goto fail;
|
|
-
|
|
- sync_entity_load_avg(&p->se);
|
|
- if (!task_util_est(p))
|
|
- goto unlock;
|
|
-
|
|
- for (; pd; pd = pd->next) {
|
|
- unsigned long cur_delta, spare_cap, max_spare_cap = 0;
|
|
- unsigned long base_energy_pd;
|
|
- int max_spare_cap_cpu = -1;
|
|
-
|
|
- /* Compute the 'base' energy of the pd, without @p */
|
|
- base_energy_pd = compute_energy(p, -1, pd);
|
|
- base_energy += base_energy_pd;
|
|
-
|
|
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
|
|
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
|
|
- continue;
|
|
-
|
|
- util = cpu_util_next(cpu, p, cpu);
|
|
- cpu_cap = capacity_of(cpu);
|
|
- spare_cap = cpu_cap - util;
|
|
-
|
|
- /*
|
|
- * Skip CPUs that cannot satisfy the capacity request.
|
|
- * IOW, placing the task there would make the CPU
|
|
- * overutilized. Take uclamp into account to see how
|
|
- * much capacity we can get out of the CPU; this is
|
|
- * aligned with schedutil_cpu_util().
|
|
- */
|
|
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
|
|
- if (!fits_capacity(util, cpu_cap))
|
|
- continue;
|
|
-
|
|
- /* Always use prev_cpu as a candidate. */
|
|
- if (cpu == prev_cpu) {
|
|
- prev_delta = compute_energy(p, prev_cpu, pd);
|
|
- prev_delta -= base_energy_pd;
|
|
- best_delta = min(best_delta, prev_delta);
|
|
- }
|
|
-
|
|
- /*
|
|
- * Find the CPU with the maximum spare capacity in
|
|
- * the performance domain
|
|
- */
|
|
- if (spare_cap > max_spare_cap) {
|
|
- max_spare_cap = spare_cap;
|
|
- max_spare_cap_cpu = cpu;
|
|
- }
|
|
- }
|
|
-
|
|
- /* Evaluate the energy impact of using this CPU. */
|
|
- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
|
|
- cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
|
|
- cur_delta -= base_energy_pd;
|
|
- if (cur_delta < best_delta) {
|
|
- best_delta = cur_delta;
|
|
- best_energy_cpu = max_spare_cap_cpu;
|
|
- }
|
|
- }
|
|
- }
|
|
-unlock:
|
|
- rcu_read_unlock();
|
|
-
|
|
- /*
|
|
- * Pick the best CPU if prev_cpu cannot be used, or if it saves at
|
|
- * least 6% of the energy used by prev_cpu.
|
|
- */
|
|
- if (prev_delta == ULONG_MAX)
|
|
- return best_energy_cpu;
|
|
-
|
|
- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
|
|
- return best_energy_cpu;
|
|
-
|
|
- return prev_cpu;
|
|
-
|
|
-fail:
|
|
- rcu_read_unlock();
|
|
-
|
|
- return -1;
|
|
-}
|
|
|
|
/*
|
|
* select_task_rq_fair: Select target runqueue for the waking task in domains
|
|
@@ -6656,19 +5767,6 @@
|
|
int want_affine = 0;
|
|
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
|
|
|
|
- if (sd_flag & SD_BALANCE_WAKE) {
|
|
- record_wakee(p);
|
|
-
|
|
- if (sched_energy_enabled()) {
|
|
- new_cpu = find_energy_efficient_cpu(p, prev_cpu);
|
|
- if (new_cpu >= 0)
|
|
- return new_cpu;
|
|
- new_cpu = prev_cpu;
|
|
- }
|
|
-
|
|
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
|
|
- }
|
|
-
|
|
rcu_read_lock();
|
|
for_each_domain(cpu, tmp) {
|
|
if (!(tmp->flags & SD_LOAD_BALANCE))
|
|
@@ -6696,7 +5794,9 @@
|
|
if (unlikely(sd)) {
|
|
/* Slow path */
|
|
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
|
- } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
|
|
+ }
|
|
+
|
|
+ else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
|
|
/* Fast path */
|
|
|
|
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
|
@@ -6718,59 +5818,6 @@
|
|
*/
|
|
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
|
|
{
|
|
- /*
|
|
- * As blocked tasks retain absolute vruntime the migration needs to
|
|
- * deal with this by subtracting the old and adding the new
|
|
- * min_vruntime -- the latter is done by enqueue_entity() when placing
|
|
- * the task on the new runqueue.
|
|
- */
|
|
- if (p->state == TASK_WAKING) {
|
|
- struct sched_entity *se = &p->se;
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- u64 min_vruntime;
|
|
-
|
|
-#ifndef CONFIG_64BIT
|
|
- u64 min_vruntime_copy;
|
|
-
|
|
- do {
|
|
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
|
|
- smp_rmb();
|
|
- min_vruntime = cfs_rq->min_vruntime;
|
|
- } while (min_vruntime != min_vruntime_copy);
|
|
-#else
|
|
- min_vruntime = cfs_rq->min_vruntime;
|
|
-#endif
|
|
-
|
|
- se->vruntime -= min_vruntime;
|
|
- }
|
|
-
|
|
- if (p->on_rq == TASK_ON_RQ_MIGRATING) {
|
|
- /*
|
|
- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
|
|
- * rq->lock and can modify state directly.
|
|
- */
|
|
- lockdep_assert_held(&task_rq(p)->lock);
|
|
- detach_entity_cfs_rq(&p->se);
|
|
-
|
|
- } else {
|
|
- /*
|
|
- * We are supposed to update the task to "current" time, then
|
|
- * its up to date and ready to go to new CPU/cfs_rq. But we
|
|
- * have difficulty in getting what current time is, so simply
|
|
- * throw away the out-of-date time. This will result in the
|
|
- * wakee task is less decayed, but giving the wakee more load
|
|
- * sounds not bad.
|
|
- */
|
|
- remove_entity_load_avg(&p->se);
|
|
- }
|
|
-
|
|
- /* Tell new CPU we are migrated */
|
|
- p->se.avg.last_update_time = 0;
|
|
-
|
|
- /* We have migrated, no longer consider this task hot */
|
|
- p->se.exec_start = 0;
|
|
-
|
|
- update_scan_period(p, new_cpu);
|
|
}
|
|
|
|
static void task_dead_fair(struct task_struct *p)
|
|
@@ -6781,32 +5828,10 @@
|
|
static int
|
|
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
{
|
|
- if (rq->nr_running)
|
|
return 1;
|
|
-
|
|
- return newidle_balance(rq, rf) != 0;
|
|
}
|
|
#endif /* CONFIG_SMP */
|
|
|
|
-static unsigned long wakeup_gran(struct sched_entity *se)
|
|
-{
|
|
- unsigned long gran = sysctl_sched_wakeup_granularity;
|
|
-
|
|
- /*
|
|
- * Since its curr running now, convert the gran from real-time
|
|
- * to virtual-time in his units.
|
|
- *
|
|
- * By using 'se' instead of 'curr' we penalize light tasks, so
|
|
- * they get preempted easier. That is, if 'se' < 'curr' then
|
|
- * the resulting gran will be larger, therefore penalizing the
|
|
- * lighter, if otoh 'se' > 'curr' then the resulting gran will
|
|
- * be smaller, again penalizing the lighter task.
|
|
- *
|
|
- * This is especially important for buddies when the leftmost
|
|
- * task is higher priority than the buddy.
|
|
- */
|
|
- return calc_delta_fair(gran, se);
|
|
-}
|
|
|
|
/*
|
|
* Should 'se' preempt 'curr'.
|
|
@@ -6817,54 +5842,43 @@
|
|
* g
|
|
* |<--->|c
|
|
*
|
|
- * w(c, s1) = -1
|
|
+ * w(c, s1) = -1 // don't preempt
|
|
* w(c, s2) = 0
|
|
- * w(c, s3) = 1
|
|
+ * w(c, s3) = 1 // preempt
|
|
*
|
|
*/
|
|
static int
|
|
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
|
|
-{
|
|
- s64 gran, vdiff = curr->vruntime - se->vruntime;
|
|
-
|
|
- if (vdiff <= 0)
|
|
- return -1;
|
|
-
|
|
- gran = wakeup_gran(se);
|
|
- if (vdiff > gran)
|
|
- return 1;
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static void set_last_buddy(struct sched_entity *se)
|
|
+wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se)
|
|
{
|
|
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
|
|
- return;
|
|
+ u64 r_curr, r_se, w_curr, w_se;
|
|
+ struct task_struct *t_curr = task_of(curr);
|
|
+ struct task_struct *t_se = task_of(se);
|
|
+ u64 vr_curr = curr->sum_exec_runtime + 1;
|
|
+ u64 vr_se = se->sum_exec_runtime + 1;
|
|
+ s64 diff;
|
|
+
|
|
+ w_curr = (now - t_curr->start_boottime) - vr_curr;
|
|
+ w_se = (now - t_se->start_boottime) - vr_se;
|
|
+
|
|
+ w_curr *= (140 - t_curr->prio);
|
|
+ w_se *= (140 - t_se->prio);
|
|
+
|
|
+ r_curr = w_curr / vr_curr;
|
|
+ r_se = w_se / vr_se;
|
|
+ diff = (s64)(r_se) - (s64)(r_curr);
|
|
|
|
- for_each_sched_entity(se) {
|
|
- if (SCHED_WARN_ON(!se->on_rq))
|
|
- return;
|
|
- cfs_rq_of(se)->last = se;
|
|
+ if (diff == 0)
|
|
+ {
|
|
+ r_curr = w_curr % vr_curr;
|
|
+ r_se = w_se % vr_se;
|
|
+ diff = (s64)(r_se) - (s64)(r_curr);
|
|
}
|
|
-}
|
|
|
|
-static void set_next_buddy(struct sched_entity *se)
|
|
-{
|
|
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
|
|
- return;
|
|
|
|
- for_each_sched_entity(se) {
|
|
- if (SCHED_WARN_ON(!se->on_rq))
|
|
- return;
|
|
- cfs_rq_of(se)->next = se;
|
|
- }
|
|
-}
|
|
+ if (diff > 0)
|
|
+ return 1;
|
|
|
|
-static void set_skip_buddy(struct sched_entity *se)
|
|
-{
|
|
- for_each_sched_entity(se)
|
|
- cfs_rq_of(se)->skip = se;
|
|
+ return -1;
|
|
}
|
|
|
|
/*
|
|
@@ -6874,28 +5888,12 @@
|
|
{
|
|
struct task_struct *curr = rq->curr;
|
|
struct sched_entity *se = &curr->se, *pse = &p->se;
|
|
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
|
- int scale = cfs_rq->nr_running >= sched_nr_latency;
|
|
- int next_buddy_marked = 0;
|
|
+ u64 now = rq_clock_task(rq);
|
|
|
|
if (unlikely(se == pse))
|
|
return;
|
|
|
|
/*
|
|
- * This is possible from callers such as attach_tasks(), in which we
|
|
- * unconditionally check_prempt_curr() after an enqueue (which may have
|
|
- * lead to a throttle). This both saves work and prevents false
|
|
- * next-buddy nomination below.
|
|
- */
|
|
- if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
|
|
- return;
|
|
-
|
|
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
|
|
- set_next_buddy(pse);
|
|
- next_buddy_marked = 1;
|
|
- }
|
|
-
|
|
- /*
|
|
* We can come here with TIF_NEED_RESCHED already set from new task
|
|
* wake up path.
|
|
*
|
|
@@ -6923,13 +5921,7 @@
|
|
find_matching_se(&se, &pse);
|
|
update_curr(cfs_rq_of(se));
|
|
BUG_ON(!pse);
|
|
- if (wakeup_preempt_entity(se, pse) == 1) {
|
|
- /*
|
|
- * Bias pick_next to pick the sched entity that is
|
|
- * triggering this preemption.
|
|
- */
|
|
- if (!next_buddy_marked)
|
|
- set_next_buddy(pse);
|
|
+ if (wakeup_preempt_entity(now, se, pse) == 1) {
|
|
goto preempt;
|
|
}
|
|
|
|
@@ -6948,113 +5940,36 @@
|
|
*/
|
|
if (unlikely(!se->on_rq || curr == rq->idle))
|
|
return;
|
|
-
|
|
- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
|
|
- set_last_buddy(se);
|
|
}
|
|
|
|
struct task_struct *
|
|
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
|
|
{
|
|
struct cfs_rq *cfs_rq = &rq->cfs;
|
|
- struct sched_entity *se;
|
|
+ struct sched_entity *se, *next;
|
|
struct task_struct *p;
|
|
- int new_tasks;
|
|
+ u64 now = rq_clock_task(rq);
|
|
|
|
-again:
|
|
- if (!sched_fair_runnable(rq))
|
|
+ if (unlikely(!sched_fair_runnable(rq)))
|
|
goto idle;
|
|
|
|
-#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
- if (!prev || prev->sched_class != &fair_sched_class)
|
|
- goto simple;
|
|
-
|
|
- /*
|
|
- * Because of the set_next_buddy() in dequeue_task_fair() it is rather
|
|
- * likely that a next task is from the same cgroup as the current.
|
|
- *
|
|
- * Therefore attempt to avoid putting and setting the entire cgroup
|
|
- * hierarchy, only change the part that actually changes.
|
|
- */
|
|
+ se = next = cfs_rq->head;
|
|
+ next = next->next[DIR_RIGHT];
|
|
|
|
- do {
|
|
- struct sched_entity *curr = cfs_rq->curr;
|
|
-
|
|
- /*
|
|
- * Since we got here without doing put_prev_entity() we also
|
|
- * have to consider cfs_rq->curr. If it is still a runnable
|
|
- * entity, update_curr() will update its vruntime, otherwise
|
|
- * forget we've ever seen it.
|
|
- */
|
|
- if (curr) {
|
|
- if (curr->on_rq)
|
|
- update_curr(cfs_rq);
|
|
- else
|
|
- curr = NULL;
|
|
-
|
|
- /*
|
|
- * This call to check_cfs_rq_runtime() will do the
|
|
- * throttle and dequeue its entity in the parent(s).
|
|
- * Therefore the nr_running test will indeed
|
|
- * be correct.
|
|
- */
|
|
- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
|
|
- cfs_rq = &rq->cfs;
|
|
-
|
|
- if (!cfs_rq->nr_running)
|
|
- goto idle;
|
|
-
|
|
- goto simple;
|
|
- }
|
|
- }
|
|
-
|
|
- se = pick_next_entity(cfs_rq, curr);
|
|
- cfs_rq = group_cfs_rq(se);
|
|
- } while (cfs_rq);
|
|
-
|
|
- p = task_of(se);
|
|
-
|
|
- /*
|
|
- * Since we haven't yet done put_prev_entity and if the selected task
|
|
- * is a different task than we started out with, try and touch the
|
|
- * least amount of cfs_rqs.
|
|
- */
|
|
- if (prev != p) {
|
|
- struct sched_entity *pse = &prev->se;
|
|
-
|
|
- while (!(cfs_rq = is_same_group(se, pse))) {
|
|
- int se_depth = se->depth;
|
|
- int pse_depth = pse->depth;
|
|
-
|
|
- if (se_depth <= pse_depth) {
|
|
- put_prev_entity(cfs_rq_of(pse), pse);
|
|
- pse = parent_entity(pse);
|
|
- }
|
|
- if (se_depth >= pse_depth) {
|
|
- set_next_entity(cfs_rq_of(se), se);
|
|
- se = parent_entity(se);
|
|
- }
|
|
- }
|
|
+ while (next)
|
|
+ {
|
|
+ if (wakeup_preempt_entity(now, se, next) == 1)
|
|
+ se = next;
|
|
|
|
- put_prev_entity(cfs_rq, pse);
|
|
- set_next_entity(cfs_rq, se);
|
|
+ next = next->next[DIR_RIGHT];
|
|
}
|
|
|
|
- goto done;
|
|
-simple:
|
|
-#endif
|
|
- if (prev)
|
|
- put_prev_task(rq, prev);
|
|
-
|
|
- do {
|
|
- se = pick_next_entity(cfs_rq, NULL);
|
|
set_next_entity(cfs_rq, se);
|
|
- cfs_rq = group_cfs_rq(se);
|
|
- } while (cfs_rq);
|
|
|
|
p = task_of(se);
|
|
|
|
-done: __maybe_unused;
|
|
+ se->quantom = 0;
|
|
+
|
|
#ifdef CONFIG_SMP
|
|
/*
|
|
* Move the next running task to the front of
|
|
@@ -7075,19 +5990,6 @@
|
|
if (!rf)
|
|
return NULL;
|
|
|
|
- new_tasks = newidle_balance(rq, rf);
|
|
-
|
|
- /*
|
|
- * Because newidle_balance() releases (and re-acquires) rq->lock, it is
|
|
- * possible for any higher priority task to appear. In that case we
|
|
- * must re-start the pick_next_entity() loop.
|
|
- */
|
|
- if (new_tasks < 0)
|
|
- return RETRY_TASK;
|
|
-
|
|
- if (new_tasks > 0)
|
|
- goto again;
|
|
-
|
|
/*
|
|
* rq is about to be idle, check if we need to update the
|
|
* lost_idle_time of clock_pelt
|
|
@@ -7125,7 +6027,6 @@
|
|
{
|
|
struct task_struct *curr = rq->curr;
|
|
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
|
|
- struct sched_entity *se = &curr->se;
|
|
|
|
/*
|
|
* Are we the only task in the tree?
|
|
@@ -7133,8 +6034,6 @@
|
|
if (unlikely(rq->nr_running == 1))
|
|
return;
|
|
|
|
- clear_buddies(cfs_rq, se);
|
|
-
|
|
if (curr->policy != SCHED_BATCH) {
|
|
update_rq_clock(rq);
|
|
/*
|
|
@@ -7148,8 +6047,6 @@
|
|
*/
|
|
rq_clock_skip_update(rq);
|
|
}
|
|
-
|
|
- set_skip_buddy(se);
|
|
}
|
|
|
|
static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
|
|
@@ -7160,9 +6057,6 @@
|
|
if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
|
|
return false;
|
|
|
|
- /* Tell the scheduler that we'd really like pse to run next. */
|
|
- set_next_buddy(se);
|
|
-
|
|
yield_task_fair(rq);
|
|
|
|
return true;
|
|
@@ -7370,39 +6264,6 @@
|
|
struct list_head tasks;
|
|
};
|
|
|
|
-/*
|
|
- * Is this task likely cache-hot:
|
|
- */
|
|
-static int task_hot(struct task_struct *p, struct lb_env *env)
|
|
-{
|
|
- s64 delta;
|
|
-
|
|
- lockdep_assert_held(&env->src_rq->lock);
|
|
-
|
|
- if (p->sched_class != &fair_sched_class)
|
|
- return 0;
|
|
-
|
|
- if (unlikely(task_has_idle_policy(p)))
|
|
- return 0;
|
|
-
|
|
- /*
|
|
- * Buddy candidates are cache hot:
|
|
- */
|
|
- if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
|
|
- (&p->se == cfs_rq_of(&p->se)->next ||
|
|
- &p->se == cfs_rq_of(&p->se)->last))
|
|
- return 1;
|
|
-
|
|
- if (sysctl_sched_migration_cost == -1)
|
|
- return 1;
|
|
- if (sysctl_sched_migration_cost == 0)
|
|
- return 0;
|
|
-
|
|
- delta = rq_clock_task(env->src_rq) - p->se.exec_start;
|
|
-
|
|
- return delta < (s64)sysctl_sched_migration_cost;
|
|
-}
|
|
-
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
/*
|
|
* Returns 1, if task migration degrades locality
|
|
@@ -7463,302 +6324,10 @@
|
|
}
|
|
#endif
|
|
|
|
-/*
|
|
- * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
|
|
- */
|
|
-static
|
|
-int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
-{
|
|
- int tsk_cache_hot;
|
|
-
|
|
- lockdep_assert_held(&env->src_rq->lock);
|
|
-
|
|
- /*
|
|
- * We do not migrate tasks that are:
|
|
- * 1) throttled_lb_pair, or
|
|
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
|
|
- * 3) running (obviously), or
|
|
- * 4) are cache-hot on their current CPU.
|
|
- */
|
|
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
|
|
- return 0;
|
|
-
|
|
- if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
|
|
- int cpu;
|
|
-
|
|
- schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
|
|
-
|
|
- env->flags |= LBF_SOME_PINNED;
|
|
-
|
|
- /*
|
|
- * Remember if this task can be migrated to any other CPU in
|
|
- * our sched_group. We may want to revisit it if we couldn't
|
|
- * meet load balance goals by pulling other tasks on src_cpu.
|
|
- *
|
|
- * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
|
|
- * already computed one in current iteration.
|
|
- */
|
|
- if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
|
|
- return 0;
|
|
-
|
|
- /* Prevent to re-select dst_cpu via env's CPUs: */
|
|
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
|
|
- if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
|
|
- env->flags |= LBF_DST_PINNED;
|
|
- env->new_dst_cpu = cpu;
|
|
- break;
|
|
- }
|
|
- }
|
|
-
|
|
- return 0;
|
|
- }
|
|
-
|
|
- /* Record that we found atleast one task that could run on dst_cpu */
|
|
- env->flags &= ~LBF_ALL_PINNED;
|
|
-
|
|
- if (task_running(env->src_rq, p)) {
|
|
- schedstat_inc(p->se.statistics.nr_failed_migrations_running);
|
|
- return 0;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Aggressive migration if:
|
|
- * 1) destination numa is preferred
|
|
- * 2) task is cache cold, or
|
|
- * 3) too many balance attempts have failed.
|
|
- */
|
|
- tsk_cache_hot = migrate_degrades_locality(p, env);
|
|
- if (tsk_cache_hot == -1)
|
|
- tsk_cache_hot = task_hot(p, env);
|
|
-
|
|
- if (tsk_cache_hot <= 0 ||
|
|
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
|
|
- if (tsk_cache_hot == 1) {
|
|
- schedstat_inc(env->sd->lb_hot_gained[env->idle]);
|
|
- schedstat_inc(p->se.statistics.nr_forced_migrations);
|
|
- }
|
|
- return 1;
|
|
- }
|
|
-
|
|
- schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
|
|
- return 0;
|
|
-}
|
|
-
|
|
-/*
|
|
- * detach_task() -- detach the task for the migration specified in env
|
|
- */
|
|
-static void detach_task(struct task_struct *p, struct lb_env *env)
|
|
-{
|
|
- lockdep_assert_held(&env->src_rq->lock);
|
|
-
|
|
- deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
|
|
- set_task_cpu(p, env->dst_cpu);
|
|
-}
|
|
|
|
-/*
|
|
- * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
|
|
- * part of active balancing operations within "domain".
|
|
- *
|
|
- * Returns a task if successful and NULL otherwise.
|
|
- */
|
|
-static struct task_struct *detach_one_task(struct lb_env *env)
|
|
-{
|
|
- struct task_struct *p;
|
|
-
|
|
- lockdep_assert_held(&env->src_rq->lock);
|
|
-
|
|
- list_for_each_entry_reverse(p,
|
|
- &env->src_rq->cfs_tasks, se.group_node) {
|
|
- if (!can_migrate_task(p, env))
|
|
- continue;
|
|
-
|
|
- detach_task(p, env);
|
|
-
|
|
- /*
|
|
- * Right now, this is only the second place where
|
|
- * lb_gained[env->idle] is updated (other is detach_tasks)
|
|
- * so we can safely collect stats here rather than
|
|
- * inside detach_tasks().
|
|
- */
|
|
- schedstat_inc(env->sd->lb_gained[env->idle]);
|
|
- return p;
|
|
- }
|
|
- return NULL;
|
|
-}
|
|
|
|
static const unsigned int sched_nr_migrate_break = 32;
|
|
|
|
-/*
|
|
- * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
|
|
- * busiest_rq, as part of a balancing operation within domain "sd".
|
|
- *
|
|
- * Returns number of detached tasks if successful and 0 otherwise.
|
|
- */
|
|
-static int detach_tasks(struct lb_env *env)
|
|
-{
|
|
- struct list_head *tasks = &env->src_rq->cfs_tasks;
|
|
- unsigned long util, load;
|
|
- struct task_struct *p;
|
|
- int detached = 0;
|
|
-
|
|
- lockdep_assert_held(&env->src_rq->lock);
|
|
-
|
|
- if (env->imbalance <= 0)
|
|
- return 0;
|
|
-
|
|
- while (!list_empty(tasks)) {
|
|
- /*
|
|
- * We don't want to steal all, otherwise we may be treated likewise,
|
|
- * which could at worst lead to a livelock crash.
|
|
- */
|
|
- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
|
|
- break;
|
|
-
|
|
- p = list_last_entry(tasks, struct task_struct, se.group_node);
|
|
-
|
|
- env->loop++;
|
|
- /* We've more or less seen every task there is, call it quits */
|
|
- if (env->loop > env->loop_max)
|
|
- break;
|
|
-
|
|
- /* take a breather every nr_migrate tasks */
|
|
- if (env->loop > env->loop_break) {
|
|
- env->loop_break += sched_nr_migrate_break;
|
|
- env->flags |= LBF_NEED_BREAK;
|
|
- break;
|
|
- }
|
|
-
|
|
- if (!can_migrate_task(p, env))
|
|
- goto next;
|
|
-
|
|
- switch (env->migration_type) {
|
|
- case migrate_load:
|
|
- load = task_h_load(p);
|
|
-
|
|
- if (sched_feat(LB_MIN) &&
|
|
- load < 16 && !env->sd->nr_balance_failed)
|
|
- goto next;
|
|
-
|
|
- /*
|
|
- * Make sure that we don't migrate too much load.
|
|
- * Nevertheless, let relax the constraint if
|
|
- * scheduler fails to find a good waiting task to
|
|
- * migrate.
|
|
- */
|
|
- if (load/2 > env->imbalance &&
|
|
- env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
|
|
- goto next;
|
|
-
|
|
- env->imbalance -= load;
|
|
- break;
|
|
-
|
|
- case migrate_util:
|
|
- util = task_util_est(p);
|
|
-
|
|
- if (util > env->imbalance)
|
|
- goto next;
|
|
-
|
|
- env->imbalance -= util;
|
|
- break;
|
|
-
|
|
- case migrate_task:
|
|
- env->imbalance--;
|
|
- break;
|
|
-
|
|
- case migrate_misfit:
|
|
- /* This is not a misfit task */
|
|
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
|
|
- goto next;
|
|
-
|
|
- env->imbalance = 0;
|
|
- break;
|
|
- }
|
|
-
|
|
- detach_task(p, env);
|
|
- list_add(&p->se.group_node, &env->tasks);
|
|
-
|
|
- detached++;
|
|
-
|
|
-#ifdef CONFIG_PREEMPTION
|
|
- /*
|
|
- * NEWIDLE balancing is a source of latency, so preemptible
|
|
- * kernels will stop after the first task is detached to minimize
|
|
- * the critical section.
|
|
- */
|
|
- if (env->idle == CPU_NEWLY_IDLE)
|
|
- break;
|
|
-#endif
|
|
-
|
|
- /*
|
|
- * We only want to steal up to the prescribed amount of
|
|
- * load/util/tasks.
|
|
- */
|
|
- if (env->imbalance <= 0)
|
|
- break;
|
|
-
|
|
- continue;
|
|
-next:
|
|
- list_move(&p->se.group_node, tasks);
|
|
- }
|
|
-
|
|
- /*
|
|
- * Right now, this is one of only two places we collect this stat
|
|
- * so we can safely collect detach_one_task() stats here rather
|
|
- * than inside detach_one_task().
|
|
- */
|
|
- schedstat_add(env->sd->lb_gained[env->idle], detached);
|
|
-
|
|
- return detached;
|
|
-}
|
|
-
|
|
-/*
|
|
- * attach_task() -- attach the task detached by detach_task() to its new rq.
|
|
- */
|
|
-static void attach_task(struct rq *rq, struct task_struct *p)
|
|
-{
|
|
- lockdep_assert_held(&rq->lock);
|
|
-
|
|
- BUG_ON(task_rq(p) != rq);
|
|
- activate_task(rq, p, ENQUEUE_NOCLOCK);
|
|
- check_preempt_curr(rq, p, 0);
|
|
-}
|
|
-
|
|
-/*
|
|
- * attach_one_task() -- attaches the task returned from detach_one_task() to
|
|
- * its new rq.
|
|
- */
|
|
-static void attach_one_task(struct rq *rq, struct task_struct *p)
|
|
-{
|
|
- struct rq_flags rf;
|
|
-
|
|
- rq_lock(rq, &rf);
|
|
- update_rq_clock(rq);
|
|
- attach_task(rq, p);
|
|
- rq_unlock(rq, &rf);
|
|
-}
|
|
-
|
|
-/*
|
|
- * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
|
|
- * new rq.
|
|
- */
|
|
-static void attach_tasks(struct lb_env *env)
|
|
-{
|
|
- struct list_head *tasks = &env->tasks;
|
|
- struct task_struct *p;
|
|
- struct rq_flags rf;
|
|
-
|
|
- rq_lock(env->dst_rq, &rf);
|
|
- update_rq_clock(env->dst_rq);
|
|
-
|
|
- while (!list_empty(tasks)) {
|
|
- p = list_first_entry(tasks, struct task_struct, se.group_node);
|
|
- list_del_init(&p->se.group_node);
|
|
-
|
|
- attach_task(env->dst_rq, p);
|
|
- }
|
|
-
|
|
- rq_unlock(env->dst_rq, &rf);
|
|
-}
|
|
|
|
#ifdef CONFIG_NO_HZ_COMMON
|
|
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
|
|
@@ -9086,293 +7655,6 @@
|
|
) / SCHED_CAPACITY_SCALE;
|
|
}
|
|
|
|
-/******* find_busiest_group() helpers end here *********************/
|
|
-
|
|
-/*
|
|
- * Decision matrix according to the local and busiest group type:
|
|
- *
|
|
- * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
|
|
- * has_spare nr_idle balanced N/A N/A balanced balanced
|
|
- * fully_busy nr_idle nr_idle N/A N/A balanced balanced
|
|
- * misfit_task force N/A N/A N/A force force
|
|
- * asym_packing force force N/A N/A force force
|
|
- * imbalanced force force N/A N/A force force
|
|
- * overloaded force force N/A N/A force avg_load
|
|
- *
|
|
- * N/A : Not Applicable because already filtered while updating
|
|
- * statistics.
|
|
- * balanced : The system is balanced for these 2 groups.
|
|
- * force : Calculate the imbalance as load migration is probably needed.
|
|
- * avg_load : Only if imbalance is significant enough.
|
|
- * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
|
|
- * different in groups.
|
|
- */
|
|
-
|
|
-/**
|
|
- * find_busiest_group - Returns the busiest group within the sched_domain
|
|
- * if there is an imbalance.
|
|
- *
|
|
- * Also calculates the amount of runnable load which should be moved
|
|
- * to restore balance.
|
|
- *
|
|
- * @env: The load balancing environment.
|
|
- *
|
|
- * Return: - The busiest group if imbalance exists.
|
|
- */
|
|
-static struct sched_group *find_busiest_group(struct lb_env *env)
|
|
-{
|
|
- struct sg_lb_stats *local, *busiest;
|
|
- struct sd_lb_stats sds;
|
|
-
|
|
- init_sd_lb_stats(&sds);
|
|
-
|
|
- /*
|
|
- * Compute the various statistics relevant for load balancing at
|
|
- * this level.
|
|
- */
|
|
- update_sd_lb_stats(env, &sds);
|
|
-
|
|
- if (sched_energy_enabled()) {
|
|
- struct root_domain *rd = env->dst_rq->rd;
|
|
-
|
|
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
|
|
- goto out_balanced;
|
|
- }
|
|
-
|
|
- local = &sds.local_stat;
|
|
- busiest = &sds.busiest_stat;
|
|
-
|
|
- /* There is no busy sibling group to pull tasks from */
|
|
- if (!sds.busiest)
|
|
- goto out_balanced;
|
|
-
|
|
- /* Misfit tasks should be dealt with regardless of the avg load */
|
|
- if (busiest->group_type == group_misfit_task)
|
|
- goto force_balance;
|
|
-
|
|
- /* ASYM feature bypasses nice load balance check */
|
|
- if (busiest->group_type == group_asym_packing)
|
|
- goto force_balance;
|
|
-
|
|
- /*
|
|
- * If the busiest group is imbalanced the below checks don't
|
|
- * work because they assume all things are equal, which typically
|
|
- * isn't true due to cpus_ptr constraints and the like.
|
|
- */
|
|
- if (busiest->group_type == group_imbalanced)
|
|
- goto force_balance;
|
|
-
|
|
- /*
|
|
- * If the local group is busier than the selected busiest group
|
|
- * don't try and pull any tasks.
|
|
- */
|
|
- if (local->group_type > busiest->group_type)
|
|
- goto out_balanced;
|
|
-
|
|
- /*
|
|
- * When groups are overloaded, use the avg_load to ensure fairness
|
|
- * between tasks.
|
|
- */
|
|
- if (local->group_type == group_overloaded) {
|
|
- /*
|
|
- * If the local group is more loaded than the selected
|
|
- * busiest group don't try to pull any tasks.
|
|
- */
|
|
- if (local->avg_load >= busiest->avg_load)
|
|
- goto out_balanced;
|
|
-
|
|
- /* XXX broken for overlapping NUMA groups */
|
|
- sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
|
|
- sds.total_capacity;
|
|
-
|
|
- /*
|
|
- * Don't pull any tasks if this group is already above the
|
|
- * domain average load.
|
|
- */
|
|
- if (local->avg_load >= sds.avg_load)
|
|
- goto out_balanced;
|
|
-
|
|
- /*
|
|
- * If the busiest group is more loaded, use imbalance_pct to be
|
|
- * conservative.
|
|
- */
|
|
- if (100 * busiest->avg_load <=
|
|
- env->sd->imbalance_pct * local->avg_load)
|
|
- goto out_balanced;
|
|
- }
|
|
-
|
|
- /* Try to move all excess tasks to child's sibling domain */
|
|
- if (sds.prefer_sibling && local->group_type == group_has_spare &&
|
|
- busiest->sum_nr_running > local->sum_nr_running + 1)
|
|
- goto force_balance;
|
|
-
|
|
- if (busiest->group_type != group_overloaded) {
|
|
- if (env->idle == CPU_NOT_IDLE)
|
|
- /*
|
|
- * If the busiest group is not overloaded (and as a
|
|
- * result the local one too) but this CPU is already
|
|
- * busy, let another idle CPU try to pull task.
|
|
- */
|
|
- goto out_balanced;
|
|
-
|
|
- if (busiest->group_weight > 1 &&
|
|
- local->idle_cpus <= (busiest->idle_cpus + 1))
|
|
- /*
|
|
- * If the busiest group is not overloaded
|
|
- * and there is no imbalance between this and busiest
|
|
- * group wrt idle CPUs, it is balanced. The imbalance
|
|
- * becomes significant if the diff is greater than 1
|
|
- * otherwise we might end up to just move the imbalance
|
|
- * on another group. Of course this applies only if
|
|
- * there is more than 1 CPU per group.
|
|
- */
|
|
- goto out_balanced;
|
|
-
|
|
- if (busiest->sum_h_nr_running == 1)
|
|
- /*
|
|
- * busiest doesn't have any tasks waiting to run
|
|
- */
|
|
- goto out_balanced;
|
|
- }
|
|
-
|
|
-force_balance:
|
|
- /* Looks like there is an imbalance. Compute it */
|
|
- calculate_imbalance(env, &sds);
|
|
- return env->imbalance ? sds.busiest : NULL;
|
|
-
|
|
-out_balanced:
|
|
- env->imbalance = 0;
|
|
- return NULL;
|
|
-}
|
|
-
|
|
-/*
|
|
- * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
|
|
- */
|
|
-static struct rq *find_busiest_queue(struct lb_env *env,
|
|
- struct sched_group *group)
|
|
-{
|
|
- struct rq *busiest = NULL, *rq;
|
|
- unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
|
|
- unsigned int busiest_nr = 0;
|
|
- int i;
|
|
-
|
|
- for_each_cpu_and(i, sched_group_span(group), env->cpus) {
|
|
- unsigned long capacity, load, util;
|
|
- unsigned int nr_running;
|
|
- enum fbq_type rt;
|
|
-
|
|
- rq = cpu_rq(i);
|
|
- rt = fbq_classify_rq(rq);
|
|
-
|
|
- /*
|
|
- * We classify groups/runqueues into three groups:
|
|
- * - regular: there are !numa tasks
|
|
- * - remote: there are numa tasks that run on the 'wrong' node
|
|
- * - all: there is no distinction
|
|
- *
|
|
- * In order to avoid migrating ideally placed numa tasks,
|
|
- * ignore those when there's better options.
|
|
- *
|
|
- * If we ignore the actual busiest queue to migrate another
|
|
- * task, the next balance pass can still reduce the busiest
|
|
- * queue by moving tasks around inside the node.
|
|
- *
|
|
- * If we cannot move enough load due to this classification
|
|
- * the next pass will adjust the group classification and
|
|
- * allow migration of more tasks.
|
|
- *
|
|
- * Both cases only affect the total convergence complexity.
|
|
- */
|
|
- if (rt > env->fbq_type)
|
|
- continue;
|
|
-
|
|
- capacity = capacity_of(i);
|
|
- nr_running = rq->cfs.h_nr_running;
|
|
-
|
|
- /*
|
|
- * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
|
|
- * eventually lead to active_balancing high->low capacity.
|
|
- * Higher per-CPU capacity is considered better than balancing
|
|
- * average load.
|
|
- */
|
|
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
|
|
- capacity_of(env->dst_cpu) < capacity &&
|
|
- nr_running == 1)
|
|
- continue;
|
|
-
|
|
- switch (env->migration_type) {
|
|
- case migrate_load:
|
|
- /*
|
|
- * When comparing with load imbalance, use cpu_load()
|
|
- * which is not scaled with the CPU capacity.
|
|
- */
|
|
- load = cpu_load(rq);
|
|
-
|
|
- if (nr_running == 1 && load > env->imbalance &&
|
|
- !check_cpu_capacity(rq, env->sd))
|
|
- break;
|
|
-
|
|
- /*
|
|
- * For the load comparisons with the other CPUs,
|
|
- * consider the cpu_load() scaled with the CPU
|
|
- * capacity, so that the load can be moved away
|
|
- * from the CPU that is potentially running at a
|
|
- * lower capacity.
|
|
- *
|
|
- * Thus we're looking for max(load_i / capacity_i),
|
|
- * crosswise multiplication to rid ourselves of the
|
|
- * division works out to:
|
|
- * load_i * capacity_j > load_j * capacity_i;
|
|
- * where j is our previous maximum.
|
|
- */
|
|
- if (load * busiest_capacity > busiest_load * capacity) {
|
|
- busiest_load = load;
|
|
- busiest_capacity = capacity;
|
|
- busiest = rq;
|
|
- }
|
|
- break;
|
|
-
|
|
- case migrate_util:
|
|
- util = cpu_util(cpu_of(rq));
|
|
-
|
|
- /*
|
|
- * Don't try to pull utilization from a CPU with one
|
|
- * running task. Whatever its utilization, we will fail
|
|
- * detach the task.
|
|
- */
|
|
- if (nr_running <= 1)
|
|
- continue;
|
|
-
|
|
- if (busiest_util < util) {
|
|
- busiest_util = util;
|
|
- busiest = rq;
|
|
- }
|
|
- break;
|
|
-
|
|
- case migrate_task:
|
|
- if (busiest_nr < nr_running) {
|
|
- busiest_nr = nr_running;
|
|
- busiest = rq;
|
|
- }
|
|
- break;
|
|
-
|
|
- case migrate_misfit:
|
|
- /*
|
|
- * For ASYM_CPUCAPACITY domains with misfit tasks we
|
|
- * simply seek the "biggest" misfit task.
|
|
- */
|
|
- if (rq->misfit_task_load > busiest_load) {
|
|
- busiest_load = rq->misfit_task_load;
|
|
- busiest = rq;
|
|
- }
|
|
-
|
|
- break;
|
|
-
|
|
- }
|
|
- }
|
|
-
|
|
- return busiest;
|
|
-}
|
|
|
|
/*
|
|
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
|
|
@@ -9419,334 +7701,6 @@
|
|
return 0;
|
|
}
|
|
|
|
-static int need_active_balance(struct lb_env *env)
|
|
-{
|
|
- struct sched_domain *sd = env->sd;
|
|
-
|
|
- if (voluntary_active_balance(env))
|
|
- return 1;
|
|
-
|
|
- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
|
|
-}
|
|
-
|
|
-static int active_load_balance_cpu_stop(void *data);
|
|
-
|
|
-static int should_we_balance(struct lb_env *env)
|
|
-{
|
|
- struct sched_group *sg = env->sd->groups;
|
|
- int cpu, balance_cpu = -1;
|
|
-
|
|
- /*
|
|
- * Ensure the balancing environment is consistent; can happen
|
|
- * when the softirq triggers 'during' hotplug.
|
|
- */
|
|
- if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
|
|
- return 0;
|
|
-
|
|
- /*
|
|
- * In the newly idle case, we will allow all the CPUs
|
|
- * to do the newly idle load balance.
|
|
- */
|
|
- if (env->idle == CPU_NEWLY_IDLE)
|
|
- return 1;
|
|
-
|
|
- /* Try to find first idle CPU */
|
|
- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
|
|
- if (!idle_cpu(cpu))
|
|
- continue;
|
|
-
|
|
- balance_cpu = cpu;
|
|
- break;
|
|
- }
|
|
-
|
|
- if (balance_cpu == -1)
|
|
- balance_cpu = group_balance_cpu(sg);
|
|
-
|
|
- /*
|
|
- * First idle CPU or the first CPU(busiest) in this sched group
|
|
- * is eligible for doing load balancing at this and above domains.
|
|
- */
|
|
- return balance_cpu == env->dst_cpu;
|
|
-}
|
|
-
|
|
-/*
|
|
- * Check this_cpu to ensure it is balanced within domain. Attempt to move
|
|
- * tasks if there is an imbalance.
|
|
- */
|
|
-static int load_balance(int this_cpu, struct rq *this_rq,
|
|
- struct sched_domain *sd, enum cpu_idle_type idle,
|
|
- int *continue_balancing)
|
|
-{
|
|
- int ld_moved, cur_ld_moved, active_balance = 0;
|
|
- struct sched_domain *sd_parent = sd->parent;
|
|
- struct sched_group *group;
|
|
- struct rq *busiest;
|
|
- struct rq_flags rf;
|
|
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
|
|
-
|
|
- struct lb_env env = {
|
|
- .sd = sd,
|
|
- .dst_cpu = this_cpu,
|
|
- .dst_rq = this_rq,
|
|
- .dst_grpmask = sched_group_span(sd->groups),
|
|
- .idle = idle,
|
|
- .loop_break = sched_nr_migrate_break,
|
|
- .cpus = cpus,
|
|
- .fbq_type = all,
|
|
- .tasks = LIST_HEAD_INIT(env.tasks),
|
|
- };
|
|
-
|
|
- cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
|
|
-
|
|
- schedstat_inc(sd->lb_count[idle]);
|
|
-
|
|
-redo:
|
|
- if (!should_we_balance(&env)) {
|
|
- *continue_balancing = 0;
|
|
- goto out_balanced;
|
|
- }
|
|
-
|
|
- group = find_busiest_group(&env);
|
|
- if (!group) {
|
|
- schedstat_inc(sd->lb_nobusyg[idle]);
|
|
- goto out_balanced;
|
|
- }
|
|
-
|
|
- busiest = find_busiest_queue(&env, group);
|
|
- if (!busiest) {
|
|
- schedstat_inc(sd->lb_nobusyq[idle]);
|
|
- goto out_balanced;
|
|
- }
|
|
-
|
|
- BUG_ON(busiest == env.dst_rq);
|
|
-
|
|
- schedstat_add(sd->lb_imbalance[idle], env.imbalance);
|
|
-
|
|
- env.src_cpu = busiest->cpu;
|
|
- env.src_rq = busiest;
|
|
-
|
|
- ld_moved = 0;
|
|
- if (busiest->nr_running > 1) {
|
|
- /*
|
|
- * Attempt to move tasks. If find_busiest_group has found
|
|
- * an imbalance but busiest->nr_running <= 1, the group is
|
|
- * still unbalanced. ld_moved simply stays zero, so it is
|
|
- * correctly treated as an imbalance.
|
|
- */
|
|
- env.flags |= LBF_ALL_PINNED;
|
|
- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
|
|
-
|
|
-more_balance:
|
|
- rq_lock_irqsave(busiest, &rf);
|
|
- update_rq_clock(busiest);
|
|
-
|
|
- /*
|
|
- * cur_ld_moved - load moved in current iteration
|
|
- * ld_moved - cumulative load moved across iterations
|
|
- */
|
|
- cur_ld_moved = detach_tasks(&env);
|
|
-
|
|
- /*
|
|
- * We've detached some tasks from busiest_rq. Every
|
|
- * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
|
|
- * unlock busiest->lock, and we are able to be sure
|
|
- * that nobody can manipulate the tasks in parallel.
|
|
- * See task_rq_lock() family for the details.
|
|
- */
|
|
-
|
|
- rq_unlock(busiest, &rf);
|
|
-
|
|
- if (cur_ld_moved) {
|
|
- attach_tasks(&env);
|
|
- ld_moved += cur_ld_moved;
|
|
- }
|
|
-
|
|
- local_irq_restore(rf.flags);
|
|
-
|
|
- if (env.flags & LBF_NEED_BREAK) {
|
|
- env.flags &= ~LBF_NEED_BREAK;
|
|
- goto more_balance;
|
|
- }
|
|
-
|
|
- /*
|
|
- * Revisit (affine) tasks on src_cpu that couldn't be moved to
|
|
- * us and move them to an alternate dst_cpu in our sched_group
|
|
- * where they can run. The upper limit on how many times we
|
|
- * iterate on same src_cpu is dependent on number of CPUs in our
|
|
- * sched_group.
|
|
- *
|
|
- * This changes load balance semantics a bit on who can move
|
|
- * load to a given_cpu. In addition to the given_cpu itself
|
|
- * (or a ilb_cpu acting on its behalf where given_cpu is
|
|
- * nohz-idle), we now have balance_cpu in a position to move
|
|
- * load to given_cpu. In rare situations, this may cause
|
|
- * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
|
|
- * _independently_ and at _same_ time to move some load to
|
|
- * given_cpu) causing exceess load to be moved to given_cpu.
|
|
- * This however should not happen so much in practice and
|
|
- * moreover subsequent load balance cycles should correct the
|
|
- * excess load moved.
|
|
- */
|
|
- if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
|
|
-
|
|
- /* Prevent to re-select dst_cpu via env's CPUs */
|
|
- __cpumask_clear_cpu(env.dst_cpu, env.cpus);
|
|
-
|
|
- env.dst_rq = cpu_rq(env.new_dst_cpu);
|
|
- env.dst_cpu = env.new_dst_cpu;
|
|
- env.flags &= ~LBF_DST_PINNED;
|
|
- env.loop = 0;
|
|
- env.loop_break = sched_nr_migrate_break;
|
|
-
|
|
- /*
|
|
- * Go back to "more_balance" rather than "redo" since we
|
|
- * need to continue with same src_cpu.
|
|
- */
|
|
- goto more_balance;
|
|
- }
|
|
-
|
|
- /*
|
|
- * We failed to reach balance because of affinity.
|
|
- */
|
|
- if (sd_parent) {
|
|
- int *group_imbalance = &sd_parent->groups->sgc->imbalance;
|
|
-
|
|
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
|
|
- *group_imbalance = 1;
|
|
- }
|
|
-
|
|
- /* All tasks on this runqueue were pinned by CPU affinity */
|
|
- if (unlikely(env.flags & LBF_ALL_PINNED)) {
|
|
- __cpumask_clear_cpu(cpu_of(busiest), cpus);
|
|
- /*
|
|
- * Attempting to continue load balancing at the current
|
|
- * sched_domain level only makes sense if there are
|
|
- * active CPUs remaining as possible busiest CPUs to
|
|
- * pull load from which are not contained within the
|
|
- * destination group that is receiving any migrated
|
|
- * load.
|
|
- */
|
|
- if (!cpumask_subset(cpus, env.dst_grpmask)) {
|
|
- env.loop = 0;
|
|
- env.loop_break = sched_nr_migrate_break;
|
|
- goto redo;
|
|
- }
|
|
- goto out_all_pinned;
|
|
- }
|
|
- }
|
|
-
|
|
- if (!ld_moved) {
|
|
- schedstat_inc(sd->lb_failed[idle]);
|
|
- /*
|
|
- * Increment the failure counter only on periodic balance.
|
|
- * We do not want newidle balance, which can be very
|
|
- * frequent, pollute the failure counter causing
|
|
- * excessive cache_hot migrations and active balances.
|
|
- */
|
|
- if (idle != CPU_NEWLY_IDLE)
|
|
- sd->nr_balance_failed++;
|
|
-
|
|
- if (need_active_balance(&env)) {
|
|
- unsigned long flags;
|
|
-
|
|
- raw_spin_lock_irqsave(&busiest->lock, flags);
|
|
-
|
|
- /*
|
|
- * Don't kick the active_load_balance_cpu_stop,
|
|
- * if the curr task on busiest CPU can't be
|
|
- * moved to this_cpu:
|
|
- */
|
|
- if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
|
|
- raw_spin_unlock_irqrestore(&busiest->lock,
|
|
- flags);
|
|
- env.flags |= LBF_ALL_PINNED;
|
|
- goto out_one_pinned;
|
|
- }
|
|
-
|
|
- /*
|
|
- * ->active_balance synchronizes accesses to
|
|
- * ->active_balance_work. Once set, it's cleared
|
|
- * only after active load balance is finished.
|
|
- */
|
|
- if (!busiest->active_balance) {
|
|
- busiest->active_balance = 1;
|
|
- busiest->push_cpu = this_cpu;
|
|
- active_balance = 1;
|
|
- }
|
|
- raw_spin_unlock_irqrestore(&busiest->lock, flags);
|
|
-
|
|
- if (active_balance) {
|
|
- stop_one_cpu_nowait(cpu_of(busiest),
|
|
- active_load_balance_cpu_stop, busiest,
|
|
- &busiest->active_balance_work);
|
|
- }
|
|
-
|
|
- /* We've kicked active balancing, force task migration. */
|
|
- sd->nr_balance_failed = sd->cache_nice_tries+1;
|
|
- }
|
|
- } else
|
|
- sd->nr_balance_failed = 0;
|
|
-
|
|
- if (likely(!active_balance) || voluntary_active_balance(&env)) {
|
|
- /* We were unbalanced, so reset the balancing interval */
|
|
- sd->balance_interval = sd->min_interval;
|
|
- } else {
|
|
- /*
|
|
- * If we've begun active balancing, start to back off. This
|
|
- * case may not be covered by the all_pinned logic if there
|
|
- * is only 1 task on the busy runqueue (because we don't call
|
|
- * detach_tasks).
|
|
- */
|
|
- if (sd->balance_interval < sd->max_interval)
|
|
- sd->balance_interval *= 2;
|
|
- }
|
|
-
|
|
- goto out;
|
|
-
|
|
-out_balanced:
|
|
- /*
|
|
- * We reach balance although we may have faced some affinity
|
|
- * constraints. Clear the imbalance flag only if other tasks got
|
|
- * a chance to move and fix the imbalance.
|
|
- */
|
|
- if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
|
|
- int *group_imbalance = &sd_parent->groups->sgc->imbalance;
|
|
-
|
|
- if (*group_imbalance)
|
|
- *group_imbalance = 0;
|
|
- }
|
|
-
|
|
-out_all_pinned:
|
|
- /*
|
|
- * We reach balance because all tasks are pinned at this level so
|
|
- * we can't migrate them. Let the imbalance flag set so parent level
|
|
- * can try to migrate them.
|
|
- */
|
|
- schedstat_inc(sd->lb_balanced[idle]);
|
|
-
|
|
- sd->nr_balance_failed = 0;
|
|
-
|
|
-out_one_pinned:
|
|
- ld_moved = 0;
|
|
-
|
|
- /*
|
|
- * newidle_balance() disregards balance intervals, so we could
|
|
- * repeatedly reach this code, which would lead to balance_interval
|
|
- * skyrocketting in a short amount of time. Skip the balance_interval
|
|
- * increase logic to avoid that.
|
|
- */
|
|
- if (env.idle == CPU_NEWLY_IDLE)
|
|
- goto out;
|
|
-
|
|
- /* tune up the balancing interval */
|
|
- if ((env.flags & LBF_ALL_PINNED &&
|
|
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
|
|
- sd->balance_interval < sd->max_interval)
|
|
- sd->balance_interval *= 2;
|
|
-out:
|
|
- return ld_moved;
|
|
-}
|
|
-
|
|
static inline unsigned long
|
|
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
|
|
{
|
|
@@ -9776,99 +7730,6 @@
|
|
}
|
|
|
|
/*
|
|
- * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
|
|
- * running tasks off the busiest CPU onto idle CPUs. It requires at
|
|
- * least 1 task to be running on each physical CPU where possible, and
|
|
- * avoids physical / logical imbalances.
|
|
- */
|
|
-static int active_load_balance_cpu_stop(void *data)
|
|
-{
|
|
- struct rq *busiest_rq = data;
|
|
- int busiest_cpu = cpu_of(busiest_rq);
|
|
- int target_cpu = busiest_rq->push_cpu;
|
|
- struct rq *target_rq = cpu_rq(target_cpu);
|
|
- struct sched_domain *sd;
|
|
- struct task_struct *p = NULL;
|
|
- struct rq_flags rf;
|
|
-
|
|
- rq_lock_irq(busiest_rq, &rf);
|
|
- /*
|
|
- * Between queueing the stop-work and running it is a hole in which
|
|
- * CPUs can become inactive. We should not move tasks from or to
|
|
- * inactive CPUs.
|
|
- */
|
|
- if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
|
|
- goto out_unlock;
|
|
-
|
|
- /* Make sure the requested CPU hasn't gone down in the meantime: */
|
|
- if (unlikely(busiest_cpu != smp_processor_id() ||
|
|
- !busiest_rq->active_balance))
|
|
- goto out_unlock;
|
|
-
|
|
- /* Is there any task to move? */
|
|
- if (busiest_rq->nr_running <= 1)
|
|
- goto out_unlock;
|
|
-
|
|
- /*
|
|
- * This condition is "impossible", if it occurs
|
|
- * we need to fix it. Originally reported by
|
|
- * Bjorn Helgaas on a 128-CPU setup.
|
|
- */
|
|
- BUG_ON(busiest_rq == target_rq);
|
|
-
|
|
- /* Search for an sd spanning us and the target CPU. */
|
|
- rcu_read_lock();
|
|
- for_each_domain(target_cpu, sd) {
|
|
- if ((sd->flags & SD_LOAD_BALANCE) &&
|
|
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
|
|
- break;
|
|
- }
|
|
-
|
|
- if (likely(sd)) {
|
|
- struct lb_env env = {
|
|
- .sd = sd,
|
|
- .dst_cpu = target_cpu,
|
|
- .dst_rq = target_rq,
|
|
- .src_cpu = busiest_rq->cpu,
|
|
- .src_rq = busiest_rq,
|
|
- .idle = CPU_IDLE,
|
|
- /*
|
|
- * can_migrate_task() doesn't need to compute new_dst_cpu
|
|
- * for active balancing. Since we have CPU_IDLE, but no
|
|
- * @dst_grpmask we need to make that test go away with lying
|
|
- * about DST_PINNED.
|
|
- */
|
|
- .flags = LBF_DST_PINNED,
|
|
- };
|
|
-
|
|
- schedstat_inc(sd->alb_count);
|
|
- update_rq_clock(busiest_rq);
|
|
-
|
|
- p = detach_one_task(&env);
|
|
- if (p) {
|
|
- schedstat_inc(sd->alb_pushed);
|
|
- /* Active balancing done, reset the failure counter. */
|
|
- sd->nr_balance_failed = 0;
|
|
- } else {
|
|
- schedstat_inc(sd->alb_failed);
|
|
- }
|
|
- }
|
|
- rcu_read_unlock();
|
|
-out_unlock:
|
|
- busiest_rq->active_balance = 0;
|
|
- rq_unlock(busiest_rq, &rf);
|
|
-
|
|
- if (p)
|
|
- attach_one_task(target_rq, p);
|
|
-
|
|
- local_irq_enable();
|
|
-
|
|
- return 0;
|
|
-}
|
|
-
|
|
-static DEFINE_SPINLOCK(balancing);
|
|
-
|
|
-/*
|
|
* Scale the max load_balance interval with the number of CPUs in the system.
|
|
* This trades load-balance latency on larger machines for less cross talk.
|
|
*/
|
|
@@ -9877,114 +7738,6 @@
|
|
max_load_balance_interval = HZ*num_online_cpus()/10;
|
|
}
|
|
|
|
-/*
|
|
- * It checks each scheduling domain to see if it is due to be balanced,
|
|
- * and initiates a balancing operation if so.
|
|
- *
|
|
- * Balancing parameters are set up in init_sched_domains.
|
|
- */
|
|
-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
|
|
-{
|
|
- int continue_balancing = 1;
|
|
- int cpu = rq->cpu;
|
|
- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
|
|
- unsigned long interval;
|
|
- struct sched_domain *sd;
|
|
- /* Earliest time when we have to do rebalance again */
|
|
- unsigned long next_balance = jiffies + 60*HZ;
|
|
- int update_next_balance = 0;
|
|
- int need_serialize, need_decay = 0;
|
|
- u64 max_cost = 0;
|
|
-
|
|
- rcu_read_lock();
|
|
- for_each_domain(cpu, sd) {
|
|
- /*
|
|
- * Decay the newidle max times here because this is a regular
|
|
- * visit to all the domains. Decay ~1% per second.
|
|
- */
|
|
- if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
|
|
- sd->max_newidle_lb_cost =
|
|
- (sd->max_newidle_lb_cost * 253) / 256;
|
|
- sd->next_decay_max_lb_cost = jiffies + HZ;
|
|
- need_decay = 1;
|
|
- }
|
|
- max_cost += sd->max_newidle_lb_cost;
|
|
-
|
|
- if (!(sd->flags & SD_LOAD_BALANCE))
|
|
- continue;
|
|
-
|
|
- /*
|
|
- * Stop the load balance at this level. There is another
|
|
- * CPU in our sched group which is doing load balancing more
|
|
- * actively.
|
|
- */
|
|
- if (!continue_balancing) {
|
|
- if (need_decay)
|
|
- continue;
|
|
- break;
|
|
- }
|
|
-
|
|
- interval = get_sd_balance_interval(sd, busy);
|
|
-
|
|
- need_serialize = sd->flags & SD_SERIALIZE;
|
|
- if (need_serialize) {
|
|
- if (!spin_trylock(&balancing))
|
|
- goto out;
|
|
- }
|
|
-
|
|
- if (time_after_eq(jiffies, sd->last_balance + interval)) {
|
|
- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
|
|
- /*
|
|
- * The LBF_DST_PINNED logic could have changed
|
|
- * env->dst_cpu, so we can't know our idle
|
|
- * state even if we migrated tasks. Update it.
|
|
- */
|
|
- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
|
|
- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
|
|
- }
|
|
- sd->last_balance = jiffies;
|
|
- interval = get_sd_balance_interval(sd, busy);
|
|
- }
|
|
- if (need_serialize)
|
|
- spin_unlock(&balancing);
|
|
-out:
|
|
- if (time_after(next_balance, sd->last_balance + interval)) {
|
|
- next_balance = sd->last_balance + interval;
|
|
- update_next_balance = 1;
|
|
- }
|
|
- }
|
|
- if (need_decay) {
|
|
- /*
|
|
- * Ensure the rq-wide value also decays but keep it at a
|
|
- * reasonable floor to avoid funnies with rq->avg_idle.
|
|
- */
|
|
- rq->max_idle_balance_cost =
|
|
- max((u64)sysctl_sched_migration_cost, max_cost);
|
|
- }
|
|
- rcu_read_unlock();
|
|
-
|
|
- /*
|
|
- * next_balance will be updated only when there is a need.
|
|
- * When the cpu is attached to null domain for ex, it will not be
|
|
- * updated.
|
|
- */
|
|
- if (likely(update_next_balance)) {
|
|
- rq->next_balance = next_balance;
|
|
-
|
|
-#ifdef CONFIG_NO_HZ_COMMON
|
|
- /*
|
|
- * If this CPU has been elected to perform the nohz idle
|
|
- * balance. Other idle CPUs have already rebalanced with
|
|
- * nohz_idle_balance() and nohz.next_balance has been
|
|
- * updated accordingly. This CPU is now running the idle load
|
|
- * balance for itself and we need to update the
|
|
- * nohz.next_balance accordingly.
|
|
- */
|
|
- if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
|
|
- nohz.next_balance = rq->next_balance;
|
|
-#endif
|
|
- }
|
|
-}
|
|
|
|
static inline int on_null_domain(struct rq *rq)
|
|
{
|
|
@@ -10014,420 +7767,12 @@
|
|
return nr_cpu_ids;
|
|
}
|
|
|
|
-/*
|
|
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
|
|
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
|
|
- */
|
|
-static void kick_ilb(unsigned int flags)
|
|
-{
|
|
- int ilb_cpu;
|
|
-
|
|
- nohz.next_balance++;
|
|
-
|
|
- ilb_cpu = find_new_ilb();
|
|
-
|
|
- if (ilb_cpu >= nr_cpu_ids)
|
|
- return;
|
|
-
|
|
- flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
|
|
- if (flags & NOHZ_KICK_MASK)
|
|
- return;
|
|
-
|
|
- /*
|
|
- * Use smp_send_reschedule() instead of resched_cpu().
|
|
- * This way we generate a sched IPI on the target CPU which
|
|
- * is idle. And the softirq performing nohz idle load balance
|
|
- * will be run before returning from the IPI.
|
|
- */
|
|
- smp_send_reschedule(ilb_cpu);
|
|
-}
|
|
-
|
|
-/*
|
|
- * Current decision point for kicking the idle load balancer in the presence
|
|
- * of idle CPUs in the system.
|
|
- */
|
|
-static void nohz_balancer_kick(struct rq *rq)
|
|
-{
|
|
- unsigned long now = jiffies;
|
|
- struct sched_domain_shared *sds;
|
|
- struct sched_domain *sd;
|
|
- int nr_busy, i, cpu = rq->cpu;
|
|
- unsigned int flags = 0;
|
|
-
|
|
- if (unlikely(rq->idle_balance))
|
|
- return;
|
|
-
|
|
- /*
|
|
- * We may be recently in ticked or tickless idle mode. At the first
|
|
- * busy tick after returning from idle, we will update the busy stats.
|
|
- */
|
|
- nohz_balance_exit_idle(rq);
|
|
-
|
|
- /*
|
|
- * None are in tickless mode and hence no need for NOHZ idle load
|
|
- * balancing.
|
|
- */
|
|
- if (likely(!atomic_read(&nohz.nr_cpus)))
|
|
- return;
|
|
-
|
|
- if (READ_ONCE(nohz.has_blocked) &&
|
|
- time_after(now, READ_ONCE(nohz.next_blocked)))
|
|
- flags = NOHZ_STATS_KICK;
|
|
-
|
|
- if (time_before(now, nohz.next_balance))
|
|
- goto out;
|
|
-
|
|
- if (rq->nr_running >= 2) {
|
|
- flags = NOHZ_KICK_MASK;
|
|
- goto out;
|
|
- }
|
|
-
|
|
- rcu_read_lock();
|
|
-
|
|
- sd = rcu_dereference(rq->sd);
|
|
- if (sd) {
|
|
- /*
|
|
- * If there's a CFS task and the current CPU has reduced
|
|
- * capacity; kick the ILB to see if there's a better CPU to run
|
|
- * on.
|
|
- */
|
|
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
|
|
- flags = NOHZ_KICK_MASK;
|
|
- goto unlock;
|
|
- }
|
|
- }
|
|
-
|
|
- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
|
|
- if (sd) {
|
|
- /*
|
|
- * When ASYM_PACKING; see if there's a more preferred CPU
|
|
- * currently idle; in which case, kick the ILB to move tasks
|
|
- * around.
|
|
- */
|
|
- for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
|
|
- if (sched_asym_prefer(i, cpu)) {
|
|
- flags = NOHZ_KICK_MASK;
|
|
- goto unlock;
|
|
- }
|
|
- }
|
|
- }
|
|
-
|
|
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
|
|
- if (sd) {
|
|
- /*
|
|
- * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
|
|
- * to run the misfit task on.
|
|
- */
|
|
- if (check_misfit_status(rq, sd)) {
|
|
- flags = NOHZ_KICK_MASK;
|
|
- goto unlock;
|
|
- }
|
|
-
|
|
- /*
|
|
- * For asymmetric systems, we do not want to nicely balance
|
|
- * cache use, instead we want to embrace asymmetry and only
|
|
- * ensure tasks have enough CPU capacity.
|
|
- *
|
|
- * Skip the LLC logic because it's not relevant in that case.
|
|
- */
|
|
- goto unlock;
|
|
- }
|
|
-
|
|
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
|
|
- if (sds) {
|
|
- /*
|
|
- * If there is an imbalance between LLC domains (IOW we could
|
|
- * increase the overall cache use), we need some less-loaded LLC
|
|
- * domain to pull some load. Likewise, we may need to spread
|
|
- * load within the current LLC domain (e.g. packed SMT cores but
|
|
- * other CPUs are idle). We can't really know from here how busy
|
|
- * the others are - so just get a nohz balance going if it looks
|
|
- * like this LLC domain has tasks we could move.
|
|
- */
|
|
- nr_busy = atomic_read(&sds->nr_busy_cpus);
|
|
- if (nr_busy > 1) {
|
|
- flags = NOHZ_KICK_MASK;
|
|
- goto unlock;
|
|
- }
|
|
- }
|
|
-unlock:
|
|
- rcu_read_unlock();
|
|
-out:
|
|
- if (flags)
|
|
- kick_ilb(flags);
|
|
-}
|
|
-
|
|
-static void set_cpu_sd_state_busy(int cpu)
|
|
-{
|
|
- struct sched_domain *sd;
|
|
-
|
|
- rcu_read_lock();
|
|
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
|
-
|
|
- if (!sd || !sd->nohz_idle)
|
|
- goto unlock;
|
|
- sd->nohz_idle = 0;
|
|
-
|
|
- atomic_inc(&sd->shared->nr_busy_cpus);
|
|
-unlock:
|
|
- rcu_read_unlock();
|
|
-}
|
|
-
|
|
void nohz_balance_exit_idle(struct rq *rq)
|
|
{
|
|
- SCHED_WARN_ON(rq != this_rq());
|
|
-
|
|
- if (likely(!rq->nohz_tick_stopped))
|
|
- return;
|
|
-
|
|
- rq->nohz_tick_stopped = 0;
|
|
- cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
|
|
- atomic_dec(&nohz.nr_cpus);
|
|
-
|
|
- set_cpu_sd_state_busy(rq->cpu);
|
|
}
|
|
|
|
-static void set_cpu_sd_state_idle(int cpu)
|
|
-{
|
|
- struct sched_domain *sd;
|
|
-
|
|
- rcu_read_lock();
|
|
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
|
|
-
|
|
- if (!sd || sd->nohz_idle)
|
|
- goto unlock;
|
|
- sd->nohz_idle = 1;
|
|
-
|
|
- atomic_dec(&sd->shared->nr_busy_cpus);
|
|
-unlock:
|
|
- rcu_read_unlock();
|
|
-}
|
|
-
|
|
-/*
|
|
- * This routine will record that the CPU is going idle with tick stopped.
|
|
- * This info will be used in performing idle load balancing in the future.
|
|
- */
|
|
void nohz_balance_enter_idle(int cpu)
|
|
{
|
|
- struct rq *rq = cpu_rq(cpu);
|
|
-
|
|
- SCHED_WARN_ON(cpu != smp_processor_id());
|
|
-
|
|
- /* If this CPU is going down, then nothing needs to be done: */
|
|
- if (!cpu_active(cpu))
|
|
- return;
|
|
-
|
|
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
|
|
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
|
|
- return;
|
|
-
|
|
- /*
|
|
- * Can be set safely without rq->lock held
|
|
- * If a clear happens, it will have evaluated last additions because
|
|
- * rq->lock is held during the check and the clear
|
|
- */
|
|
- rq->has_blocked_load = 1;
|
|
-
|
|
- /*
|
|
- * The tick is still stopped but load could have been added in the
|
|
- * meantime. We set the nohz.has_blocked flag to trig a check of the
|
|
- * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
|
|
- * of nohz.has_blocked can only happen after checking the new load
|
|
- */
|
|
- if (rq->nohz_tick_stopped)
|
|
- goto out;
|
|
-
|
|
- /* If we're a completely isolated CPU, we don't play: */
|
|
- if (on_null_domain(rq))
|
|
- return;
|
|
-
|
|
- rq->nohz_tick_stopped = 1;
|
|
-
|
|
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
|
|
- atomic_inc(&nohz.nr_cpus);
|
|
-
|
|
- /*
|
|
- * Ensures that if nohz_idle_balance() fails to observe our
|
|
- * @idle_cpus_mask store, it must observe the @has_blocked
|
|
- * store.
|
|
- */
|
|
- smp_mb__after_atomic();
|
|
-
|
|
- set_cpu_sd_state_idle(cpu);
|
|
-
|
|
-out:
|
|
- /*
|
|
- * Each time a cpu enter idle, we assume that it has blocked load and
|
|
- * enable the periodic update of the load of idle cpus
|
|
- */
|
|
- WRITE_ONCE(nohz.has_blocked, 1);
|
|
-}
|
|
-
|
|
-/*
|
|
- * Internal function that runs load balance for all idle cpus. The load balance
|
|
- * can be a simple update of blocked load or a complete load balance with
|
|
- * tasks movement depending of flags.
|
|
- * The function returns false if the loop has stopped before running
|
|
- * through all idle CPUs.
|
|
- */
|
|
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
|
|
- enum cpu_idle_type idle)
|
|
-{
|
|
- /* Earliest time when we have to do rebalance again */
|
|
- unsigned long now = jiffies;
|
|
- unsigned long next_balance = now + 60*HZ;
|
|
- bool has_blocked_load = false;
|
|
- int update_next_balance = 0;
|
|
- int this_cpu = this_rq->cpu;
|
|
- int balance_cpu;
|
|
- int ret = false;
|
|
- struct rq *rq;
|
|
-
|
|
- SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
|
|
-
|
|
- /*
|
|
- * We assume there will be no idle load after this update and clear
|
|
- * the has_blocked flag. If a cpu enters idle in the mean time, it will
|
|
- * set the has_blocked flag and trig another update of idle load.
|
|
- * Because a cpu that becomes idle, is added to idle_cpus_mask before
|
|
- * setting the flag, we are sure to not clear the state and not
|
|
- * check the load of an idle cpu.
|
|
- */
|
|
- WRITE_ONCE(nohz.has_blocked, 0);
|
|
-
|
|
- /*
|
|
- * Ensures that if we miss the CPU, we must see the has_blocked
|
|
- * store from nohz_balance_enter_idle().
|
|
- */
|
|
- smp_mb();
|
|
-
|
|
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
|
|
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
|
|
- continue;
|
|
-
|
|
- /*
|
|
- * If this CPU gets work to do, stop the load balancing
|
|
- * work being done for other CPUs. Next load
|
|
- * balancing owner will pick it up.
|
|
- */
|
|
- if (need_resched()) {
|
|
- has_blocked_load = true;
|
|
- goto abort;
|
|
- }
|
|
-
|
|
- rq = cpu_rq(balance_cpu);
|
|
-
|
|
- has_blocked_load |= update_nohz_stats(rq, true);
|
|
-
|
|
- /*
|
|
- * If time for next balance is due,
|
|
- * do the balance.
|
|
- */
|
|
- if (time_after_eq(jiffies, rq->next_balance)) {
|
|
- struct rq_flags rf;
|
|
-
|
|
- rq_lock_irqsave(rq, &rf);
|
|
- update_rq_clock(rq);
|
|
- rq_unlock_irqrestore(rq, &rf);
|
|
-
|
|
- if (flags & NOHZ_BALANCE_KICK)
|
|
- rebalance_domains(rq, CPU_IDLE);
|
|
- }
|
|
-
|
|
- if (time_after(next_balance, rq->next_balance)) {
|
|
- next_balance = rq->next_balance;
|
|
- update_next_balance = 1;
|
|
- }
|
|
- }
|
|
-
|
|
- /* Newly idle CPU doesn't need an update */
|
|
- if (idle != CPU_NEWLY_IDLE) {
|
|
- update_blocked_averages(this_cpu);
|
|
- has_blocked_load |= this_rq->has_blocked_load;
|
|
- }
|
|
-
|
|
- if (flags & NOHZ_BALANCE_KICK)
|
|
- rebalance_domains(this_rq, CPU_IDLE);
|
|
-
|
|
- WRITE_ONCE(nohz.next_blocked,
|
|
- now + msecs_to_jiffies(LOAD_AVG_PERIOD));
|
|
-
|
|
- /* The full idle balance loop has been done */
|
|
- ret = true;
|
|
-
|
|
-abort:
|
|
- /* There is still blocked load, enable periodic update */
|
|
- if (has_blocked_load)
|
|
- WRITE_ONCE(nohz.has_blocked, 1);
|
|
-
|
|
- /*
|
|
- * next_balance will be updated only when there is a need.
|
|
- * When the CPU is attached to null domain for ex, it will not be
|
|
- * updated.
|
|
- */
|
|
- if (likely(update_next_balance))
|
|
- nohz.next_balance = next_balance;
|
|
-
|
|
- return ret;
|
|
-}
|
|
-
|
|
-/*
|
|
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
|
|
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
|
|
- */
|
|
-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
|
|
-{
|
|
- int this_cpu = this_rq->cpu;
|
|
- unsigned int flags;
|
|
-
|
|
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
|
|
- return false;
|
|
-
|
|
- if (idle != CPU_IDLE) {
|
|
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
|
|
- return false;
|
|
- }
|
|
-
|
|
- /* could be _relaxed() */
|
|
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
|
|
- if (!(flags & NOHZ_KICK_MASK))
|
|
- return false;
|
|
-
|
|
- _nohz_idle_balance(this_rq, flags, idle);
|
|
-
|
|
- return true;
|
|
-}
|
|
-
|
|
-static void nohz_newidle_balance(struct rq *this_rq)
|
|
-{
|
|
- int this_cpu = this_rq->cpu;
|
|
-
|
|
- /*
|
|
- * This CPU doesn't want to be disturbed by scheduler
|
|
- * housekeeping
|
|
- */
|
|
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
|
|
- return;
|
|
-
|
|
- /* Will wake up very soon. No time for doing anything else*/
|
|
- if (this_rq->avg_idle < sysctl_sched_migration_cost)
|
|
- return;
|
|
-
|
|
- /* Don't need to update blocked load of idle CPUs*/
|
|
- if (!READ_ONCE(nohz.has_blocked) ||
|
|
- time_before(jiffies, READ_ONCE(nohz.next_blocked)))
|
|
- return;
|
|
-
|
|
- raw_spin_unlock(&this_rq->lock);
|
|
- /*
|
|
- * This CPU is going to be idle and blocked load of idle CPUs
|
|
- * need to be updated. Run the ilb locally as it is a good
|
|
- * candidate for ilb instead of waking up another idle CPU.
|
|
- * Kick an normal ilb if we failed to do the update.
|
|
- */
|
|
- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
|
|
- kick_ilb(NOHZ_STATS_KICK);
|
|
- raw_spin_lock(&this_rq->lock);
|
|
}
|
|
|
|
#else /* !CONFIG_NO_HZ_COMMON */
|
|
@@ -10441,169 +7786,6 @@
|
|
static inline void nohz_newidle_balance(struct rq *this_rq) { }
|
|
#endif /* CONFIG_NO_HZ_COMMON */
|
|
|
|
-/*
|
|
- * idle_balance is called by schedule() if this_cpu is about to become
|
|
- * idle. Attempts to pull tasks from other CPUs.
|
|
- *
|
|
- * Returns:
|
|
- * < 0 - we released the lock and there are !fair tasks present
|
|
- * 0 - failed, no new tasks
|
|
- * > 0 - success, new (fair) tasks present
|
|
- */
|
|
-int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
|
|
-{
|
|
- unsigned long next_balance = jiffies + HZ;
|
|
- int this_cpu = this_rq->cpu;
|
|
- struct sched_domain *sd;
|
|
- int pulled_task = 0;
|
|
- u64 curr_cost = 0;
|
|
-
|
|
- update_misfit_status(NULL, this_rq);
|
|
- /*
|
|
- * We must set idle_stamp _before_ calling idle_balance(), such that we
|
|
- * measure the duration of idle_balance() as idle time.
|
|
- */
|
|
- this_rq->idle_stamp = rq_clock(this_rq);
|
|
-
|
|
- /*
|
|
- * Do not pull tasks towards !active CPUs...
|
|
- */
|
|
- if (!cpu_active(this_cpu))
|
|
- return 0;
|
|
-
|
|
- /*
|
|
- * This is OK, because current is on_cpu, which avoids it being picked
|
|
- * for load-balance and preemption/IRQs are still disabled avoiding
|
|
- * further scheduler activity on it and we're being very careful to
|
|
- * re-start the picking loop.
|
|
- */
|
|
- rq_unpin_lock(this_rq, rf);
|
|
-
|
|
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
|
|
- !READ_ONCE(this_rq->rd->overload)) {
|
|
-
|
|
- rcu_read_lock();
|
|
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
|
|
- if (sd)
|
|
- update_next_balance(sd, &next_balance);
|
|
- rcu_read_unlock();
|
|
-
|
|
- nohz_newidle_balance(this_rq);
|
|
-
|
|
- goto out;
|
|
- }
|
|
-
|
|
- raw_spin_unlock(&this_rq->lock);
|
|
-
|
|
- update_blocked_averages(this_cpu);
|
|
- rcu_read_lock();
|
|
- for_each_domain(this_cpu, sd) {
|
|
- int continue_balancing = 1;
|
|
- u64 t0, domain_cost;
|
|
-
|
|
- if (!(sd->flags & SD_LOAD_BALANCE))
|
|
- continue;
|
|
-
|
|
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
|
|
- update_next_balance(sd, &next_balance);
|
|
- break;
|
|
- }
|
|
-
|
|
- if (sd->flags & SD_BALANCE_NEWIDLE) {
|
|
- t0 = sched_clock_cpu(this_cpu);
|
|
-
|
|
- pulled_task = load_balance(this_cpu, this_rq,
|
|
- sd, CPU_NEWLY_IDLE,
|
|
- &continue_balancing);
|
|
-
|
|
- domain_cost = sched_clock_cpu(this_cpu) - t0;
|
|
- if (domain_cost > sd->max_newidle_lb_cost)
|
|
- sd->max_newidle_lb_cost = domain_cost;
|
|
-
|
|
- curr_cost += domain_cost;
|
|
- }
|
|
-
|
|
- update_next_balance(sd, &next_balance);
|
|
-
|
|
- /*
|
|
- * Stop searching for tasks to pull if there are
|
|
- * now runnable tasks on this rq.
|
|
- */
|
|
- if (pulled_task || this_rq->nr_running > 0)
|
|
- break;
|
|
- }
|
|
- rcu_read_unlock();
|
|
-
|
|
- raw_spin_lock(&this_rq->lock);
|
|
-
|
|
- if (curr_cost > this_rq->max_idle_balance_cost)
|
|
- this_rq->max_idle_balance_cost = curr_cost;
|
|
-
|
|
-out:
|
|
- /*
|
|
- * While browsing the domains, we released the rq lock, a task could
|
|
- * have been enqueued in the meantime. Since we're not going idle,
|
|
- * pretend we pulled a task.
|
|
- */
|
|
- if (this_rq->cfs.h_nr_running && !pulled_task)
|
|
- pulled_task = 1;
|
|
-
|
|
- /* Move the next balance forward */
|
|
- if (time_after(this_rq->next_balance, next_balance))
|
|
- this_rq->next_balance = next_balance;
|
|
-
|
|
- /* Is there a task of a high priority class? */
|
|
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
|
|
- pulled_task = -1;
|
|
-
|
|
- if (pulled_task)
|
|
- this_rq->idle_stamp = 0;
|
|
-
|
|
- rq_repin_lock(this_rq, rf);
|
|
-
|
|
- return pulled_task;
|
|
-}
|
|
-
|
|
-/*
|
|
- * run_rebalance_domains is triggered when needed from the scheduler tick.
|
|
- * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
|
|
- */
|
|
-static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
|
|
-{
|
|
- struct rq *this_rq = this_rq();
|
|
- enum cpu_idle_type idle = this_rq->idle_balance ?
|
|
- CPU_IDLE : CPU_NOT_IDLE;
|
|
-
|
|
- /*
|
|
- * If this CPU has a pending nohz_balance_kick, then do the
|
|
- * balancing on behalf of the other idle CPUs whose ticks are
|
|
- * stopped. Do nohz_idle_balance *before* rebalance_domains to
|
|
- * give the idle CPUs a chance to load balance. Else we may
|
|
- * load balance only within the local sched_domain hierarchy
|
|
- * and abort nohz_idle_balance altogether if we pull some load.
|
|
- */
|
|
- if (nohz_idle_balance(this_rq, idle))
|
|
- return;
|
|
-
|
|
- /* normal load balance */
|
|
- update_blocked_averages(this_rq->cpu);
|
|
- rebalance_domains(this_rq, idle);
|
|
-}
|
|
-
|
|
-/*
|
|
- * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
|
|
- */
|
|
-void trigger_load_balance(struct rq *rq)
|
|
-{
|
|
- /* Don't need to rebalance while attached to NULL domain */
|
|
- if (unlikely(on_null_domain(rq)))
|
|
- return;
|
|
-
|
|
- if (time_after_eq(jiffies, rq->next_balance))
|
|
- raise_softirq(SCHED_SOFTIRQ);
|
|
-
|
|
- nohz_balancer_kick(rq);
|
|
-}
|
|
|
|
static void rq_online_fair(struct rq *rq)
|
|
{
|
|
@@ -10640,9 +7822,6 @@
|
|
entity_tick(cfs_rq, se, queued);
|
|
}
|
|
|
|
- if (static_branch_unlikely(&sched_numa_balancing))
|
|
- task_tick_numa(rq, curr);
|
|
-
|
|
update_misfit_status(curr, rq);
|
|
update_overutilized_status(task_rq(curr));
|
|
}
|
|
@@ -10655,7 +7834,7 @@
static void task_fork_fair(struct task_struct *p)
{
        struct cfs_rq *cfs_rq;
-        struct sched_entity *se = &p->se, *curr;
+        struct sched_entity *curr;
        struct rq *rq = this_rq();
        struct rq_flags rf;

@@ -10666,20 +7845,9 @@
        curr = cfs_rq->curr;
        if (curr) {
                update_curr(cfs_rq);
-                se->vruntime = curr->vruntime;
        }
-        place_entity(cfs_rq, se, 1);

-        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
-                /*
-                 * Upon rescheduling, sched_class::put_prev_task() will place
-                 * 'current' within the tree based on its new key value.
-                 */
-                swap(curr->vruntime, se->vruntime);
-                resched_curr(rq);
-        }

-        se->vruntime -= cfs_rq->min_vruntime;
        rq_unlock(rq, &rf);
}

@@ -10708,58 +7876,9 @@
        check_preempt_curr(rq, p, 0);
}

-static inline bool vruntime_normalized(struct task_struct *p)
-{
-        struct sched_entity *se = &p->se;
-
-        /*
-         * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
-         * the dequeue_entity(.flags=0) will already have normalized the
-         * vruntime.
-         */
-        if (p->on_rq)
-                return true;
-
-        /*
-         * When !on_rq, vruntime of the task has usually NOT been normalized.
-         * But there are some cases where it has already been normalized:
-         *
-         * - A forked child which is waiting for being woken up by
-         *   wake_up_new_task().
-         * - A task which has been woken up by try_to_wake_up() and
-         *   waiting for actually being woken up by sched_ttwu_pending().
-         */
-        if (!se->sum_exec_runtime ||
-            (p->state == TASK_WAKING && p->sched_remote_wakeup))
-                return true;
-
-        return false;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * Propagate the changes of the sched_entity across the tg tree to make it
- * visible to the root
- */
-static void propagate_entity_cfs_rq(struct sched_entity *se)
-{
-        struct cfs_rq *cfs_rq;
-
-        /* Start to propagate at parent */
-        se = se->parent;

-        for_each_sched_entity(se) {
-                cfs_rq = cfs_rq_of(se);
-
-                if (cfs_rq_throttled(cfs_rq))
-                        break;
-
-                update_load_avg(cfs_rq, se, UPDATE_TG);
-        }
-}
-#else
static void propagate_entity_cfs_rq(struct sched_entity *se) { }
-#endif
+

static void detach_entity_cfs_rq(struct sched_entity *se)
{
@@ -10776,14 +7895,6 @@
{
        struct cfs_rq *cfs_rq = cfs_rq_of(se);

-#ifdef CONFIG_FAIR_GROUP_SCHED
-        /*
-         * Since the real-depth could have been changed (only FAIR
-         * class maintain depth value), reset depth properly.
-         */
-        se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
        /* Synchronize entity with its cfs_rq */
        update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
        attach_entity_load_avg(cfs_rq, se);
@@ -10794,29 +7905,13 @@
static void detach_task_cfs_rq(struct task_struct *p)
{
        struct sched_entity *se = &p->se;
-        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
-        if (!vruntime_normalized(p)) {
-                /*
-                 * Fix up our vruntime so that the current sleep doesn't
-                 * cause 'unlimited' sleep bonus.
-                 */
-                place_entity(cfs_rq, se, 0);
-                se->vruntime -= cfs_rq->min_vruntime;
-        }
-
        detach_entity_cfs_rq(se);
}

static void attach_task_cfs_rq(struct task_struct *p)
{
        struct sched_entity *se = &p->se;
-        struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
        attach_entity_cfs_rq(se);
-
-        if (!vruntime_normalized(p))
-                se->vruntime += cfs_rq->min_vruntime;
}

static void switched_from_fair(struct rq *rq, struct task_struct *p)
@@ -10879,6 +7974,8 @@
#ifdef CONFIG_SMP
        raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
+
+        cfs_rq->head = NULL;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -11203,7 +8300,6 @@
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
-        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
        nohz.next_balance = jiffies;
diff --color -rubN linux-5.7.6/kernel/sched/sched.h linux-5.7.6.cachy/kernel/sched/sched.h
--- linux-5.7.6/kernel/sched/sched.h 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/kernel/sched/sched.h 2020-07-24 17:52:04.479461959 +1000
@@ -516,6 +516,7 @@
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
+        struct sched_entity *head;
        struct sched_entity *curr;
        struct sched_entity *next;
        struct sched_entity *last;
@@ -541,50 +542,7 @@
                unsigned long runnable_avg;
        } removed;

-#ifdef CONFIG_FAIR_GROUP_SCHED
-        unsigned long tg_load_avg_contrib;
-        long propagate;
-        long prop_runnable_sum;
-
-        /*
-         * h_load = weight * f(tg)
-         *
-         * Where f(tg) is the recursive weight fraction assigned to
-         * this group.
-         */
-        unsigned long h_load;
-        u64 last_h_load_update;
-        struct sched_entity *h_load_next;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-        struct rq *rq;  /* CPU runqueue to which this cfs_rq is attached */
-
-        /*
-         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
-         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
-         * (like users, containers etc.)
-         *
-         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
-         * This list is used during load balance.
-         */
-        int on_list;
-        struct list_head leaf_cfs_rq_list;
-        struct task_group *tg; /* group that "owns" this runqueue */
-
-#ifdef CONFIG_CFS_BANDWIDTH
-        int runtime_enabled;
-        s64 runtime_remaining;
-
-        u64 throttled_clock;
-        u64 throttled_clock_task;
-        u64 throttled_clock_task_time;
-        int throttled;
-        int throttle_count;
-        struct list_head throttled_list;
-#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
};

static inline int rt_bandwidth_enabled(void)
diff --color -rubN linux-5.7.6/Makefile linux-5.7.6.cachy/Makefile
--- linux-5.7.6/Makefile 2020-06-25 01:49:26.000000000 +1000
+++ linux-5.7.6.cachy/Makefile 2020-07-24 14:33:53.453645295 +1000
@@ -2,8 +2,8 @@
VERSION = 5
PATCHLEVEL = 7
SUBLEVEL = 6
-EXTRAVERSION =
-NAME = Kleptomaniac Octopus
+EXTRAVERSION = -cachy
+NAME = Cachy

# *DOCUMENTATION*
# To see a list of typical targets execute "make help"