diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1d56a6b73a4e..4d55ff02310c 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1087,6 +1087,10 @@ Model available). If your platform happens to meet the requirements for EAS but you do not want to use it, change this value to 0. +sched_interactivity_factor (CacULE scheduler only) +================================================== +Sets the value *m* for interactivity score calculations. See +Figure 1 in https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf sched_schedstats ================ diff --git a/Documentation/scheduler/sched-CacULE.rst b/Documentation/scheduler/sched-CacULE.rst new file mode 100644 index 000000000000..82b0847c468a --- /dev/null +++ b/Documentation/scheduler/sched-CacULE.rst @@ -0,0 +1,76 @@ +====================================== +The CacULE Scheduler by Hamad Al Marri. +====================================== + +1. Overview +============= + +The CacULE CPU scheduler is based on interactivity score mechanism. +The interactivity score is inspired by the ULE scheduler (FreeBSD +scheduler). + +1.1 About CacULE Scheduler +-------------------------- + + - Each CPU has its own runqueue. + + - NORMAL runqueue is a linked list of sched_entities (instead of RB-Tree). + + - RT and other runqueues are just the same as the CFS's. + + - Wake up tasks preempt currently running tasks if its interactivity score value + is higher. + + +1.2. Complexity +---------------- + +The complexity of Enqueue and Dequeue a task is O(1). + +The complexity of pick the next task is in O(n), where n is the number of tasks +in a runqueue (each CPU has its own runqueue). + +Note: O(n) sounds scary, but usually for a machine with 4 CPUS where it is used +for desktop or mobile jobs, the maximum number of runnable tasks might not +exceeds 10 (at the pick next run time) - the idle tasks are excluded since they +are dequeued when sleeping and enqueued when they wake up. + + +2. The CacULE Interactivity Score +======================================================= + +The interactivity score is inspired by the ULE scheduler (FreeBSD scheduler). +For more information see: https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf +CacULE doesn't replace CFS with ULE, it only changes the CFS' pick next task +mechanism to ULE's interactivity score mechanism for picking next task to run. + + +2.3 sched_interactivity_factor +================= +Sets the value *m* for interactivity score calculations. See Figure 1 in +https://web.cs.ucdavis.edu/~roper/ecs150/ULE.pdf +The default value of in CacULE is 10 which means that the Maximum Interactive +Score is 20 (since m = Maximum Interactive Score / 2). +You can tune sched_interactivity_factor with sysctl command: + + sysctl kernel.sched_interactivity_factor=50 + +This command changes the sched_interactivity_factor from 10 to 50. + + +3. Scheduling policies +======================= + +CacULE some CFS, implements three scheduling policies: + + - SCHED_NORMAL (traditionally called SCHED_OTHER): The scheduling + policy that is used for regular tasks. + + - SCHED_BATCH: Does not preempt nearly as often as regular tasks + would, thereby allowing tasks to run longer and make better use of + caches but at the cost of interactivity. This is well suited for + batch jobs. + + - SCHED_IDLE: This is even weaker than nice 19, but its not a true + idle timer scheduler in order to avoid to get into priority + inversion problems which would deadlock the machine. diff --git a/include/linux/sched.h b/include/linux/sched.h index edc01bcefbfd..9e16c9dd3d78 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -458,10 +458,22 @@ struct sched_statistics { #endif }; +#ifdef CONFIG_CACULE_SCHED +struct cacule_node { + struct cacule_node* next; + struct cacule_node* prev; + u64 cacule_start_time; + u64 vruntime; +}; +#endif + struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; +#ifdef CONFIG_CACULE_SCHED + struct cacule_node cacule_node; +#endif struct list_head group_node; unsigned int on_rq; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3c31ba88aca5..4cf162341ab8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -31,6 +31,12 @@ extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_CACULE_SCHED +extern unsigned int interactivity_factor; +extern unsigned int interactivity_threshold; +extern unsigned int cacule_max_lifetime; +#endif + enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/init/Kconfig b/init/Kconfig index 5f5c776ef192..92330b5d8897 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -830,6 +830,17 @@ config UCLAMP_BUCKETS_COUNT endmenu +config CACULE_SCHED + bool "CacULE CPU scheduler" + default y + help + The CacULE CPU scheduler is based on interactivity score mechanism. + The interactivity score is inspired by the ULE scheduler (FreeBSD + scheduler). + + If unsure, say Y here. + + # # For architectures that want to enable the support for NUMA-affine scheduler # balancing logic: @@ -1213,6 +1224,7 @@ config SCHED_AUTOGROUP select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED + default y help This option optimizes the scheduler for common desktop workloads by automatically creating and populating task groups. This separation diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 38ef6d06888e..865f8dbddca8 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -46,6 +46,9 @@ choice 1000 Hz is the preferred choice for desktop systems and other systems requiring fast interactive responses to events. + config HZ_2000 + bool "2000 HZ" + endchoice config HZ @@ -54,6 +57,7 @@ config HZ default 250 if HZ_250 default 300 if HZ_300 default 1000 if HZ_1000 + default 2000 if HZ_2000 config SCHED_HRTICK def_bool HIGH_RES_TIMERS diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 814200541f8f..353f88cd05ca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3555,6 +3555,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + +#ifdef CONFIG_CACULE_SCHED + p->se.cacule_node.vruntime = 0; +#endif + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3840,6 +3845,10 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); +#ifdef CONFIG_CACULE_SCHED + p->se.cacule_node.cacule_start_time = sched_clock(); +#endif + activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -8094,6 +8103,10 @@ void __init sched_init(void) BUG_ON(&dl_sched_class + 1 != &stop_sched_class); #endif +#ifdef CONFIG_CACULE_SCHED + printk(KERN_INFO "CacULE CPU scheduler v5.12-r2 by Hamad Al Marri."); +#endif + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9e0a915e6eb8..77ac9cd82113 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -554,8 +554,11 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; + s64 MIN_vruntime = -1, max_vruntime = -1, +#if !defined(CONFIG_CACULE_SCHED) + min_vruntime, rq0_min_vruntime, spread0, +#endif + spread; struct rq *rq = cpu_rq(cpu); struct sched_entity *last; unsigned long flags; @@ -576,21 +579,27 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; +#if !defined(CONFIG_CACULE_SCHED) min_vruntime = cfs_rq->min_vruntime; rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; +#endif raw_spin_unlock_irqrestore(&rq->lock, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", SPLIT_NS(MIN_vruntime)); +#if !defined(CONFIG_CACULE_SCHED) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", SPLIT_NS(max_vruntime)); spread = max_vruntime - MIN_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); +#if !defined(CONFIG_CACULE_SCHED) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); +#endif SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47fcc3fe9dc5..c0a60cc8d9ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,10 @@ * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * CacULE enhancements CPU cache and scheduler based on + * Interactivity Score. + * (C) 2020 Hamad Al Marri <hamad.s.almarri@gmail.com> */ #include "sched.h" @@ -82,7 +86,11 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; unsigned int sysctl_sched_wakeup_granularity = 1000000UL; static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +#ifdef CONFIG_CACULE_SCHED +const_debug unsigned int sysctl_sched_migration_cost = 200000UL; +#else const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) @@ -113,6 +121,17 @@ int __weak arch_asym_cpu_priority(int cpu) */ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) +#endif +#ifdef CONFIG_CACULE_SCHED +unsigned int __read_mostly cacule_max_lifetime = 22000; // in ms +unsigned int __read_mostly interactivity_factor = 32768; + +#ifdef CONFIG_FAIR_GROUP_SCHED +unsigned int __read_mostly interactivity_threshold = 0; +#else +unsigned int __read_mostly interactivity_threshold = 1000; +#endif + #endif #ifdef CONFIG_CFS_BANDWIDTH @@ -253,6 +272,14 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight const struct sched_class fair_sched_class; + +#ifdef CONFIG_CACULE_SCHED +static inline struct sched_entity *se_of(struct cacule_node *cn) +{ + return container_of(cn, struct sched_entity, cacule_node); +} +#endif + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -512,7 +539,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: */ - +#if !defined(CONFIG_CACULE_SCHED) static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { s64 delta = (s64)(vruntime - max_vruntime); @@ -575,7 +602,204 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { return entity_before(__node_2_se(a), __node_2_se(b)); } +#endif /* CONFIG_CACULE_SCHED */ + +#ifdef CONFIG_CACULE_SCHED +static unsigned int +calc_interactivity(u64 now, struct cacule_node *se) +{ + u64 l_se, vr_se, sleep_se = 1ULL, u64_factor_m, _2m; + unsigned int score_se; + + /* + * in case of vruntime==0, logical OR with 1 would + * make sure that the least sig. bit is 1 + */ + l_se = now - se->cacule_start_time; + vr_se = se->vruntime | 1; + u64_factor_m = interactivity_factor; + _2m = u64_factor_m << 1; + + /* safety check */ + if (likely(l_se > vr_se)) + sleep_se = (l_se - vr_se) | 1; + + if (sleep_se >= vr_se) + score_se = u64_factor_m / (sleep_se / vr_se); + else + score_se = _2m - (u64_factor_m / (vr_se / sleep_se)); + + return score_se; +} + +static inline int is_interactive(struct cacule_node *cn) +{ + if (!interactivity_threshold || se_of(cn)->vruntime == 0) + return 0; + + return calc_interactivity(sched_clock(), cn) < interactivity_threshold; +} + +static inline int cn_has_idle_policy(struct cacule_node *se) +{ + return task_has_idle_policy(task_of(se_of(se))); +} + +static inline int +entity_before_cached(u64 now, unsigned int score_curr, struct cacule_node *se) +{ + unsigned int score_se; + int diff; + + /* + * if se has idle class, then no need to + * calculate, since we are sure that score_curr + * is a score for non idle class task + */ + if (cn_has_idle_policy(se)) + return -1; + + score_se = calc_interactivity(now, se); + diff = score_se - score_curr; + + if (diff <= 0) + return 1; + + return -1; +} + +/* + * Does se have lower interactivity score value (i.e. interactive) than curr? If yes, return 1, + * otherwise return -1 + * se is before curr if se has lower interactivity score value + * the lower score, the more interactive + */ +static inline int +entity_before(u64 now, struct cacule_node *curr, struct cacule_node *se) +{ + unsigned int score_curr, score_se; + int diff; + int is_curr_idle = cn_has_idle_policy(curr); + int is_se_idle = cn_has_idle_policy(se); + + /* if curr is normal but se is idle class, then no */ + if (!is_curr_idle && is_se_idle) + return -1; + + /* if curr is idle class and se is normal, then yes */ + if (is_curr_idle && !is_se_idle) + return 1; + + score_curr = calc_interactivity(now, curr); + score_se = calc_interactivity(now, se); + + diff = score_se - score_curr; + + if (diff < 0) + return 1; + + return -1; +} +/* + * Enqueue an entity + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) +{ + struct cacule_node *se = &(_se->cacule_node); + struct cacule_node *iter, *next = NULL; + u64 now = sched_clock(); + unsigned int score_se = calc_interactivity(now, se); + int is_idle_task = cn_has_idle_policy(se); + + se->next = NULL; + se->prev = NULL; + + if (likely(cfs_rq->head)) { + + // start from tail + iter = cfs_rq->tail; + + /* + * if this task has idle class, then + * push it to the tail right away + */ + if (is_idle_task) + goto to_tail; + + /* here we know that this task isn't idle clas */ + + // does se have higher IS than iter? + while (iter && entity_before_cached(now, score_se, iter) == -1) { + next = iter; + iter = iter->prev; + } + + // se in tail position + if (iter == cfs_rq->tail) { +to_tail: + cfs_rq->tail->next = se; + se->prev = cfs_rq->tail; + + cfs_rq->tail = se; + } + // else if not head no tail, insert se after iter + else if (iter) { + se->next = next; + se->prev = iter; + + iter->next = se; + next->prev = se; + } + // insert se at head + else { + se->next = cfs_rq->head; + cfs_rq->head->prev = se; + + // lastly reset the head + cfs_rq->head = se; + } + } else { + // if empty rq + cfs_rq->head = se; + cfs_rq->tail = se; + } +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) +{ + struct cacule_node *se = &(_se->cacule_node); + + // if only one se in rq + if (cfs_rq->head == cfs_rq->tail) { + cfs_rq->head = NULL; + cfs_rq->tail = NULL; + + } else if (se == cfs_rq->head) { + // if it is the head + cfs_rq->head = cfs_rq->head->next; + cfs_rq->head->prev = NULL; + } else if (se == cfs_rq->tail) { + // if it is the tail + cfs_rq->tail = cfs_rq->tail->prev; + cfs_rq->tail->next = NULL; + } else { + // if in the middle + struct cacule_node *prev = se->prev; + struct cacule_node *next = se->next; + + prev->next = next; + + if (next) + next->prev = prev; + } +} + +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +{ + return se_of(cfs_rq->head); +} +#else /* * Enqueue an entity into the rb-tree: */ @@ -608,16 +832,24 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) return __node_2_se(next); } +#endif /* CONFIG_CACULE_SCHED */ #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { +#ifdef CONFIG_CACULE_SCHED + if (!cfs_rq->tail) + return NULL; + + return se_of(cfs_rq->tail); +#else struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); if (!last) return NULL; return __node_2_se(last); +#endif /* CONFIG_CACULE_SCHED */ } /************************************************************** @@ -712,6 +944,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) return slice; } +#if !defined(CONFIG_CACULE_SCHED) /* * We calculate the vruntime slice of a to-be-inserted task. * @@ -721,6 +954,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { return calc_delta_fair(sched_slice(cfs_rq, se), se); } +#endif /* CONFIG_CACULE_SCHED */ #include "pelt.h" #ifdef CONFIG_SMP @@ -828,14 +1062,51 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_CACULE_SCHED +static void normalize_lifetime(u64 now, struct sched_entity *se) +{ + struct cacule_node *cn = &se->cacule_node; + u64 max_life_ns, life_time; + s64 diff; + + /* + * left shift 20 bits is approximately = * 1000000 + * we don't need the precision of life time + * Ex. for 30s, with left shift (20bits) == 31.457s + */ + max_life_ns = ((u64) cacule_max_lifetime) << 20; + life_time = now - cn->cacule_start_time; + diff = life_time - max_life_ns; + + if (diff > 0) { + // multiply life_time by 1024 for more precision + u64 old_hrrn_x = (life_time << 7) / ((cn->vruntime >> 3) | 1); + + // reset life to half max_life (i.e ~15s) + cn->cacule_start_time = now - (max_life_ns >> 1); + + // avoid division by zero + if (old_hrrn_x == 0) old_hrrn_x = 1; + + // reset vruntime based on old hrrn ratio + cn->vruntime = (max_life_ns << 9) / old_hrrn_x; + } +} +#endif /* CONFIG_CACULE_SCHED */ + /* * Update the current task's runtime statistics. */ static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; +#ifdef CONFIG_CACULE_SCHED + u64 now = sched_clock(); + u64 delta_exec, delta_fair; +#else u64 now = rq_clock_task(rq_of(cfs_rq)); u64 delta_exec; +#endif if (unlikely(!curr)) return; @@ -852,8 +1123,15 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_CACULE_SCHED + delta_fair = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_fair; + curr->cacule_node.vruntime += delta_fair; + normalize_lifetime(now, curr); +#else curr->vruntime += calc_delta_fair(delta_exec, curr); update_min_vruntime(cfs_rq); +#endif if (entity_is_task(curr)) { struct task_struct *curtask = task_of(curr); @@ -1021,7 +1299,6 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - if (!schedstat_enabled()) return; @@ -1053,7 +1330,11 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ +#ifdef CONFIG_CACULE_SCHED + se->exec_start = sched_clock(); +#else se->exec_start = rq_clock_task(rq_of(cfs_rq)); +#endif } /************************************************** @@ -4122,7 +4403,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) { -#ifdef CONFIG_SCHED_DEBUG +#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_CACULE_SCHED) s64 d = se->vruntime - cfs_rq->min_vruntime; if (d < 0) @@ -4133,6 +4414,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) #endif } +#if !defined(CONFIG_CACULE_SCHED) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { @@ -4164,6 +4446,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* ensure we never gain time by being placed backwards. */ se->vruntime = max_vruntime(se->vruntime, vruntime); } +#endif /* CONFIG_CACULE_SCHED */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4222,18 +4505,23 @@ static inline bool cfs_bandwidth_used(void); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACULE_SCHED) bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); +#endif bool curr = cfs_rq->curr == se; +#if !defined(CONFIG_CACULE_SCHED) /* * If we're the current task, we must renormalise before calling * update_curr(). */ if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; +#endif update_curr(cfs_rq); +#if !defined(CONFIG_CACULE_SCHED) /* * Otherwise, renormalise after, such that we're placed at the current * moment in time, instead of some random moment in the past. Being @@ -4242,6 +4530,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; +#endif /* * When enqueuing a sched_entity, we must: @@ -4256,8 +4545,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); account_entity_enqueue(cfs_rq, se); +#if !defined(CONFIG_CACULE_SCHED) if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); +#endif check_schedstat_required(); update_stats_enqueue(cfs_rq, se, flags); @@ -4278,6 +4569,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_enqueue_throttle(cfs_rq); } +#if !defined(CONFIG_CACULE_SCHED) static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { @@ -4322,6 +4614,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) if (cfs_rq->skip == se) __clear_buddies_skip(se); } +#endif /* !CONFIG_CACULE_SCHED */ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -4346,13 +4639,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue(cfs_rq, se, flags); +#if !defined(CONFIG_CACULE_SCHED) clear_buddies(cfs_rq, se); +#endif if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; account_entity_dequeue(cfs_rq, se); +#if !defined(CONFIG_CACULE_SCHED) /* * Normalize after update_curr(); which will also have moved * min_vruntime if @se is the one holding it back. But before doing @@ -4361,12 +4657,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; +#endif /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); update_cfs_group(se); +#if !defined(CONFIG_CACULE_SCHED) /* * Now advance min_vruntime if @se was the entity holding it back, * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be @@ -4375,8 +4673,21 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) update_min_vruntime(cfs_rq); +#endif } +#ifdef CONFIG_CACULE_SCHED +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + // does head have higher IS than curr + if (entity_before(sched_clock(), &curr->cacule_node, cfs_rq->head) == 1) + resched_curr(rq_of(cfs_rq)); +} +#else /* * Preempt the current task with a newly woken task if needed: */ @@ -4416,6 +4727,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (delta > ideal_runtime) resched_curr(rq_of(cfs_rq)); } +#endif /* CONFIG_CACULE_SCHED */ static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4450,6 +4762,21 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +#ifdef CONFIG_CACULE_SCHED +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + struct cacule_node *se = cfs_rq->head; + + if (unlikely(!se)) + se = &curr->cacule_node; + else if (unlikely(curr + && entity_before(sched_clock(), se, &curr->cacule_node) == 1)) + se = &curr->cacule_node; + + return se_of(se); +} +#else static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); @@ -4510,6 +4837,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) return se; } +#endif /* CONFIG_CACULE_SCHED */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -5612,7 +5940,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } +#if !defined(CONFIG_CACULE_SCHED) static void set_next_buddy(struct sched_entity *se); +#endif /* * The dequeue_task method is called before nr_running is @@ -5644,12 +5974,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ se = parent_entity(se); +#if !defined(CONFIG_CACULE_SCHED) /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ if (task_sleep && se && !throttled_hierarchy(cfs_rq)) set_next_buddy(se); +#endif break; } flags |= DEQUEUE_SLEEP; @@ -5765,6 +6097,7 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +#if !defined(CONFIG_CACULE_SCHED) static void record_wakee(struct task_struct *p) { /* @@ -5811,6 +6144,7 @@ static int wake_wide(struct task_struct *p) return 0; return 1; } +#endif /* CONFIG_CACULE_SCHED */ /* * The purpose of wake_affine() is to quickly determine on which CPU we can run @@ -6513,6 +6847,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) return min_t(unsigned long, util, capacity_orig_of(cpu)); } +#if !defined(CONFIG_CACULE_SCHED) /* * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) * to @dst_cpu. @@ -6762,6 +7097,57 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) return -1; } +#endif /* CONFIG_CACULE_SCHED */ + +#ifdef CONFIG_CACULE_SCHED +static int +find_least_IS_cpu(struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + unsigned int max_IS = 0; + unsigned int IS, IS_c, IS_h; + struct sched_entity *curr_se; + struct cacule_node *cn, *head; + int cpu_i; + int new_cpu = -1; + + for_each_online_cpu(cpu_i) { + if (!cpumask_test_cpu(cpu_i, p->cpus_ptr)) + continue; + + cn = NULL; + cfs_rq = &cpu_rq(cpu_i)->cfs; + + curr_se = cfs_rq->curr; + head = cfs_rq->head; + + if (!curr_se && head) + cn = head; + else if (curr_se && !head) + cn = &curr_se->cacule_node; + else if (curr_se && head) { + IS_c = calc_interactivity(sched_clock(), &curr_se->cacule_node); + IS_h = calc_interactivity(sched_clock(), head); + + IS = IS_c > IS_h? IS_c : IS_h; + goto compare; + } + + if (!cn) + return cpu_i; + + IS = calc_interactivity(sched_clock(), cn); + +compare: + if (IS > max_IS) { + max_IS = IS; + new_cpu = cpu_i; + } + } + + return new_cpu; +} +#endif /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -6786,6 +7172,26 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; +#ifdef CONFIG_CACULE_SCHED + struct sched_entity *se = &p->se; + + if (!is_interactive(&se->cacule_node)) + goto cfs_way; + + // check first if the prev cpu + // has 0 tasks + if (cpumask_test_cpu(prev_cpu, p->cpus_ptr) && + cpu_rq(prev_cpu)->cfs.nr_running == 0) + return prev_cpu; + + new_cpu = find_least_IS_cpu(p); + + if (new_cpu != -1) + return new_cpu; + + new_cpu = prev_cpu; +cfs_way: +#else if (wake_flags & WF_TTWU) { record_wakee(p); @@ -6798,6 +7204,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } +#endif /* CONFIG_CACULE_SCHED */ rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -6844,6 +7251,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se); */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { +#if !defined(CONFIG_CACULE_SCHED) /* * As blocked tasks retain absolute vruntime the migration needs to * deal with this by subtracting the old and adding the new @@ -6869,6 +7277,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) se->vruntime -= min_vruntime; } +#endif /* CONFIG_CACULE_SCHED */ if (p->on_rq == TASK_ON_RQ_MIGRATING) { /* @@ -6914,6 +7323,7 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ +#if !defined(CONFIG_CACULE_SCHED) static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6992,6 +7402,7 @@ static void set_skip_buddy(struct sched_entity *se) for_each_sched_entity(se) cfs_rq_of(se)->skip = se; } +#endif /* CONFIG_CACULE_SCHED */ /* * Preempt the current task with a newly woken task if needed: @@ -7000,9 +7411,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; + +#if !defined(CONFIG_CACULE_SCHED) struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; +#endif /* CONFIG_CACULE_SCHED */ if (unlikely(se == pse)) return; @@ -7016,10 +7430,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; +#if !defined(CONFIG_CACULE_SCHED) if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); next_buddy_marked = 1; } +#endif /* CONFIG_CACULE_SCHED */ /* * We can come here with TIF_NEED_RESCHED already set from new task @@ -7049,6 +7465,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ find_matching_se(&se, &pse); update_curr(cfs_rq_of(se)); BUG_ON(!pse); + +#ifdef CONFIG_CACULE_SCHED + if (entity_before(sched_clock(), &se->cacule_node, &pse->cacule_node) == 1) + goto preempt; +#else if (wakeup_preempt_entity(se, pse) == 1) { /* * Bias pick_next to pick the sched entity that is @@ -7058,11 +7479,14 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ set_next_buddy(pse); goto preempt; } +#endif /* CONFIG_CACULE_SCHED */ return; preempt: resched_curr(rq); + +#if !defined(CONFIG_CACULE_SCHED) /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -7077,6 +7501,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) set_last_buddy(se); +#endif /* CONFIG_CACULE_SCHED */ } struct task_struct * @@ -7251,7 +7676,10 @@ static void yield_task_fair(struct rq *rq) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + +#if !defined(CONFIG_CACULE_SCHED) struct sched_entity *se = &curr->se; +#endif /* * Are we the only task in the tree? @@ -7259,7 +7687,9 @@ static void yield_task_fair(struct rq *rq) if (unlikely(rq->nr_running == 1)) return; +#if !defined(CONFIG_CACULE_SCHED) clear_buddies(cfs_rq, se); +#endif if (curr->policy != SCHED_BATCH) { update_rq_clock(rq); @@ -7275,7 +7705,9 @@ static void yield_task_fair(struct rq *rq) rq_clock_skip_update(rq); } +#if !defined(CONFIG_CACULE_SCHED) set_skip_buddy(se); +#endif } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -7286,8 +7718,10 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; +#if !defined(CONFIG_CACULE_SCHED) /* Tell the scheduler that we'd really like pse to run next. */ set_next_buddy(se); +#endif yield_task_fair(rq); @@ -7515,6 +7949,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (env->sd->flags & SD_SHARE_CPUCAPACITY) return 0; +#if !defined(CONFIG_CACULE_SCHED) /* * Buddy candidates are cache hot: */ @@ -7522,6 +7957,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) (&p->se == cfs_rq_of(&p->se)->next || &p->se == cfs_rq_of(&p->se)->last)) return 1; +#endif if (sysctl_sched_migration_cost == -1) return 1; @@ -10585,9 +11021,11 @@ static void nohz_newidle_balance(struct rq *this_rq) if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) return; +#if !defined(CONFIG_CACULE_SCHED) /* Will wake up very soon. No time for doing anything else*/ if (this_rq->avg_idle < sysctl_sched_migration_cost) return; +#endif /* Don't need to update blocked load of idle CPUs*/ if (!READ_ONCE(nohz.has_blocked) || @@ -10655,7 +11093,10 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - if (this_rq->avg_idle < sysctl_sched_migration_cost || + if ( +#if !defined(CONFIG_CACULE_SCHED) + this_rq->avg_idle < sysctl_sched_migration_cost || +#endif !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); @@ -10823,11 +11264,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) update_overutilized_status(task_rq(curr)); } +#ifdef CONFIG_CACULE_SCHED /* * called on fork with the child task as argument from the parent's context * - child not yet on the tasklist * - preemption disabled */ + static void task_fork_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *curr; + struct rq *rq = this_rq(); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); + + cfs_rq = task_cfs_rq(current); + curr = cfs_rq->curr; + if (curr) + update_curr(cfs_rq); + + rq_unlock(rq, &rf); +} +#else static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; @@ -10858,6 +11318,7 @@ static void task_fork_fair(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; rq_unlock(rq, &rf); } +#endif /* CONFIG_CACULE_SCHED */ /* * Priority of the task has changed. Check to see if we preempt @@ -10976,6 +11437,8 @@ static void attach_entity_cfs_rq(struct sched_entity *se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + +#if !defined(CONFIG_CACULE_SCHED) struct cfs_rq *cfs_rq = cfs_rq_of(se); if (!vruntime_normalized(p)) { @@ -10986,6 +11449,7 @@ static void detach_task_cfs_rq(struct task_struct *p) place_entity(cfs_rq, se, 0); se->vruntime -= cfs_rq->min_vruntime; } +#endif detach_entity_cfs_rq(se); } @@ -10993,12 +11457,17 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + +#if !defined(CONFIG_CACULE_SCHED) struct cfs_rq *cfs_rq = cfs_rq_of(se); +#endif attach_entity_cfs_rq(se); +#if !defined(CONFIG_CACULE_SCHED) if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; +#endif } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -11054,13 +11523,22 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; + +#if !defined(CONFIG_CACULE_SCHED) cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif +#endif /* CONFIG_CACULE_SCHED */ + #ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); #endif + +#ifdef CONFIG_CACULE_SCHED + cfs_rq->head = NULL; + cfs_rq->tail = NULL; +#endif } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e4e4f47cee6a..0eb4fca83ffe 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -523,10 +523,13 @@ struct cfs_rq { unsigned int idle_h_nr_running; /* SCHED_IDLE */ u64 exec_clock; + +#if !defined(CONFIG_CACULE_SCHED) u64 min_vruntime; #ifndef CONFIG_64BIT u64 min_vruntime_copy; #endif +#endif /* CONFIG_CACULE_SCHED */ struct rb_root_cached tasks_timeline; @@ -535,9 +538,15 @@ struct cfs_rq { * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr; +#ifdef CONFIG_CACULE_SCHED + struct cacule_node *head; + struct cacule_node *tail; + +#else struct sched_entity *next; struct sched_entity *last; struct sched_entity *skip; +#endif // CONFIG_CACULE_SCHED #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62fbd09b5dc1..8cbb8c5663d3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1659,6 +1659,29 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_CACULE_SCHED + { + .procname = "sched_interactivity_factor", + .data = &interactivity_factor, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_interactivity_threshold", + .data = &interactivity_threshold, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_max_lifetime_ms", + .data = &cacule_max_lifetime, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_SCHED_DEBUG { .procname = "sched_min_granularity_ns",